[llvm] e292650 - [AMDGPU] Aggressively fold immediates in SIShrinkInstructions

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Wed May 18 03:04:44 PDT 2022


Author: Jay Foad
Date: 2022-05-18T11:04:33+01:00
New Revision: e2926501d886ce8c1cd08db1d3a02c314c2f412d

URL: https://github.com/llvm/llvm-project/commit/e2926501d886ce8c1cd08db1d3a02c314c2f412d
DIFF: https://github.com/llvm/llvm-project/commit/e2926501d886ce8c1cd08db1d3a02c314c2f412d.diff

LOG: [AMDGPU] Aggressively fold immediates in SIShrinkInstructions

Fold immediates regardless of how many uses they have. This is expected
to increase overall code size, but decrease register usage.

Differential Revision: https://reviews.llvm.org/D114644

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
    llvm/test/CodeGen/AMDGPU/add.v2i16.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
    llvm/test/CodeGen/AMDGPU/and.ll
    llvm/test/CodeGen/AMDGPU/bswap.ll
    llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
    llvm/test/CodeGen/AMDGPU/ctpop16.ll
    llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
    llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
    llvm/test/CodeGen/AMDGPU/fexp.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
    llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
    llvm/test/CodeGen/AMDGPU/fneg-combines.ll
    llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
    llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
    llvm/test/CodeGen/AMDGPU/fpow.ll
    llvm/test/CodeGen/AMDGPU/frem.ll
    llvm/test/CodeGen/AMDGPU/fshr.ll
    llvm/test/CodeGen/AMDGPU/idot2.ll
    llvm/test/CodeGen/AMDGPU/idot4s.ll
    llvm/test/CodeGen/AMDGPU/idot4u.ll
    llvm/test/CodeGen/AMDGPU/idot8s.ll
    llvm/test/CodeGen/AMDGPU/idot8u.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.log.ll
    llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.log10.ll
    llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
    llvm/test/CodeGen/AMDGPU/load-global-i16.ll
    llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
    llvm/test/CodeGen/AMDGPU/madmk.ll
    llvm/test/CodeGen/AMDGPU/mul.i16.ll
    llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
    llvm/test/CodeGen/AMDGPU/operand-folding.ll
    llvm/test/CodeGen/AMDGPU/packed-fp32.ll
    llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
    llvm/test/CodeGen/AMDGPU/saddsat.ll
    llvm/test/CodeGen/AMDGPU/sdiv.ll
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
    llvm/test/CodeGen/AMDGPU/shl.ll
    llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
    llvm/test/CodeGen/AMDGPU/sra.ll
    llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
    llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
    llvm/test/CodeGen/AMDGPU/uaddsat.ll
    llvm/test/CodeGen/AMDGPU/udiv.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/udivrem.ll
    llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll
    llvm/test/CodeGen/AMDGPU/usubsat.ll
    llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index d2a8cf7945a0c..fd1e57b1e9512 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -93,7 +93,7 @@ bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
   MachineOperand &Src0 = MI.getOperand(Src0Idx);
   if (Src0.isReg()) {
     Register Reg = Src0.getReg();
-    if (Reg.isVirtual() && MRI->hasOneUse(Reg)) {
+    if (Reg.isVirtual()) {
       MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
       if (Def && Def->isMoveImmediate()) {
         MachineOperand &MovSrc = Def->getOperand(1);
@@ -115,8 +115,8 @@ bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
         }
 
         if (ConstantFolded) {
-          assert(MRI->use_empty(Reg));
-          Def->eraseFromParent();
+          if (MRI->use_nodbg_empty(Reg))
+            Def->eraseFromParent();
           ++NumLiteralConstantsFolded;
           return true;
         }
@@ -739,11 +739,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         }
       }
 
-      // FIXME: We also need to consider movs of constant operands since
-      // immediate operands are not folded if they have more than one use, and
-      // the operand folding pass is unaware if the immediate will be free since
-      // it won't know if the src == dest constraint will end up being
-      // satisfied.
+      // Try to use S_ADDK_I32 and S_MULK_I32.
       if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
           MI.getOpcode() == AMDGPU::S_MUL_I32) {
         const MachineOperand *Dest = &MI.getOperand(0);

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index aa29bf5ee65ff..fc79f3581a41d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -94,9 +94,8 @@ define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
 ; GFX8-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s4, 0x80008000
-; GFX8-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_add_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index 2280dc45e7de2..c8d3b6d7b894f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -80,11 +80,10 @@ define i8 @v_uaddo_i8(i8 %a, i8 %b) {
 ; GFX7-LABEL: v_uaddo_i8:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -93,11 +92,10 @@ define i8 @v_uaddo_i8(i8 %a, i8 %b) {
 ; GFX8-LABEL: v_uaddo_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0xff
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
@@ -123,11 +121,10 @@ define i7 @v_uaddo_i7(i7 %a, i7 %b) {
 ; GFX7-LABEL: v_uaddo_i7:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0x7f
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -136,11 +133,10 @@ define i7 @v_uaddo_i7(i7 %a, i7 %b) {
 ; GFX8-LABEL: v_uaddo_i7:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0x7f
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
@@ -149,11 +145,10 @@ define i7 @v_uaddo_i7(i7 %a, i7 %b) {
 ; GFX9-LABEL: v_uaddo_i7:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7f
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u16_e32 v0, v0, v1
@@ -576,11 +571,10 @@ define i8 @s_uaddo_i8(i8 %a, i8 %b) {
 ; GFX7-LABEL: s_uaddo_i8:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -589,11 +583,10 @@ define i8 @s_uaddo_i8(i8 %a, i8 %b) {
 ; GFX8-LABEL: s_uaddo_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0xff
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
@@ -619,11 +612,10 @@ define i7 @s_uaddo_i7(i7 %a, i7 %b) {
 ; GFX7-LABEL: s_uaddo_i7:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0x7f
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -632,11 +624,10 @@ define i7 @s_uaddo_i7(i7 %a, i7 %b) {
 ; GFX8-LABEL: s_uaddo_i7:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0x7f
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
@@ -645,11 +636,10 @@ define i7 @s_uaddo_i7(i7 %a, i7 %b) {
 ; GFX9-LABEL: s_uaddo_i7:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7f
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u16_e32 v0, v0, v1
@@ -966,11 +956,10 @@ define amdgpu_ps i32 @uaddo_i32_sv(i32 inreg %a, i32 %b) {
 define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) {
 ; GFX7-LABEL: uaddo_i16_sv:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX7-NEXT:    v_and_b32_e32 v0, s1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s1, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -979,11 +968,10 @@ define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) {
 ;
 ; GFX8-LABEL: uaddo_i16_sv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    v_and_b32_e32 v0, s1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s1, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index 99d3bf4f2c4ed..4481a29839ac7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -559,12 +559,11 @@ define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
 ; GFX6-LABEL: v_andn2_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, v0, v1
@@ -816,18 +815,17 @@ define <4 x i16> @v_andn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) {
 ; GFX6-LABEL: v_andn2_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v8
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, v4, v8
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v4
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX6-NEXT:    v_and_b32_e32 v4, v6, v8
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index a50950782f3c7..6d204b88a681e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -722,11 +722,10 @@ define <2 x i16> @v_ashr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 ; GFX6-LABEL: v_ashr_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, v2, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -845,15 +844,14 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
 ; GFX6-LABEL: ashr_v2i16_sv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s2, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX6-NEXT:    v_ashr_i32_e32 v0, s0, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    s_sext_i32_i16 s0, s1
 ; GFX6-NEXT:    v_ashr_i32_e32 v1, s0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -889,10 +887,9 @@ define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount)
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, s0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -935,25 +932,24 @@ define <2 x float> @v_ashr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 ; GFX6-LABEL: v_ashr_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, v4, v0
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v5
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v5
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, v4, v1
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v6
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, v4, v2
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, v4, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -1098,46 +1094,44 @@ define <4 x float> @v_ashr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-LABEL: v_ashr_v8i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v8
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, v8, v0
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v9
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v9
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, v8, v1
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v10
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v10
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, v8, v2
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v11
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v11
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT:    v_mov_b32_e32 v16, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, v8, v3
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v12
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v12
 ; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, v8, v4
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v13
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v13
 ; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v16
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, v8, v5
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v14
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v14
 ; GFX6-NEXT:    v_bfe_i32 v6, v6, 0, 16
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v16
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, v8, v6
-; GFX6-NEXT:    v_and_b32_e32 v8, v15, v16
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v15
 ; GFX6-NEXT:    v_bfe_i32 v7, v7, 0, 16
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, v2, v16
-; GFX6-NEXT:    v_and_b32_e32 v2, v3, v16
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, v8, v7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, v5, v16
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, v4, v16
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v7, v16
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT:    v_and_b32_e32 v3, v6, v16
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
index fb96801ca2cc3..e029e32c74d58 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
@@ -66,9 +66,8 @@ define half @test_f16_sub_ext_neg_mul(half %x, half %y, half %z) {
 ; GFX9-CONTRACT-LABEL: test_f16_sub_ext_neg_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %entry
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-CONTRACT-NEXT:    s_mov_b32 s4, 0x8000
-; GFX9-CONTRACT-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX9-CONTRACT-NEXT:    v_xor_b32_e32 v2, s4, v2
+; GFX9-CONTRACT-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX9-CONTRACT-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
 ; GFX9-CONTRACT-NEXT:    v_fma_f16 v0, v0, v1, v2
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
index bb35b9c25d25a..a30df912a4e10 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
@@ -892,18 +892,18 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NEXT:    s_lshr_b32 s1, s2, 2
-; GFX7-NEXT:    s_and_b32 s2, s2, 3
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX7-NEXT:    s_lshr_b32 s0, s2, 2
+; GFX7-NEXT:    s_and_b32 s1, s2, 3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX7-NEXT:    s_lshl_b32 s0, s1, 3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v4, s0, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v6, s0, v1
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
@@ -918,7 +918,6 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT:    s_lshl_b32 s0, s2, 3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -1019,7 +1018,6 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
@@ -1028,9 +1026,9 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX7-NEXT:    v_bfe_u32 v9, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
@@ -2291,58 +2289,56 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX7-NEXT:    s_lshr_b32 s1, s2, 2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
-; GFX7-NEXT:    s_and_b32 s2, s2, 3
+; GFX7-NEXT:    s_lshr_b32 s0, s2, 2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX7-NEXT:    s_and_b32 s1, s2, 3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v12, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v9, s0, v0
+; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v11, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v11, s0, v1
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v14, v2, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
-; GFX7-NEXT:    v_and_b32_e32 v13, v2, v4
+; GFX7-NEXT:    v_bfe_u32 v13, v2, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xff, v2
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v15, v3, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xff, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
-; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
-; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v10, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v15
-; GFX7-NEXT:    v_or_b32_e32 v2, v11, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v7
+; GFX7-NEXT:    v_or_b32_e32 v11, v14, v15
+; GFX7-NEXT:    v_or_b32_e32 v2, v10, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 2
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX7-NEXT:    s_lshl_b32 s0, s2, 3
+; GFX7-NEXT:    s_lshl_b32 s0, s1, 3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -2500,57 +2496,55 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0xff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 2, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
 ; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v11, v3, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v13, v4, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX7-NEXT:    v_and_b32_e32 v10, s4, v3
+; GFX7-NEXT:    v_bfe_u32 v10, v3, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v12, v4, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v4
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v12, s4, v4
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xff, v4
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v15, v5, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v5
-; GFX7-NEXT:    v_and_b32_e32 v14, v5, v0
+; GFX7-NEXT:    v_bfe_u32 v14, v5, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v5
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v5
 ; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v16, v6, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
-; GFX7-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
-; GFX7-NEXT:    v_and_b32_e32 v0, v6, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v6
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xff, v6
 ; GFX7-NEXT:    v_bfe_u32 v6, v6, 16, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
-; GFX7-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v11, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
+; GFX7-NEXT:    v_or_b32_e32 v3, v9, v3
+; GFX7-NEXT:    v_or_b32_e32 v4, v10, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v16
-; GFX7-NEXT:    v_or_b32_e32 v5, v12, v5
-; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX7-NEXT:    v_or_b32_e32 v3, v4, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX7-NEXT:    v_or_b32_e32 v4, v5, v8
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_or_b32_e32 v12, v15, v16
+; GFX7-NEXT:    v_or_b32_e32 v5, v11, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v5, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v17
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v9
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX7-NEXT:    v_or_b32_e32 v4, v6, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v17
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 0a9107a912509..bcbc57de78687 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -866,8 +866,7 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX6-FLUSH-LABEL: v_rcp_v2f16:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT:    s_movk_i32 s6, 0x3c00
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, s6
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
@@ -881,7 +880,7 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, s6
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
@@ -1001,8 +1000,7 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 ; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT:    s_movk_i32 s6, 0x3c00
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, s6
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
@@ -1016,7 +1014,7 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, s6
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
@@ -1182,8 +1180,7 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT:    s_movk_i32 s6, 0x3c00
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, s6
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
@@ -1197,7 +1194,7 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, s6
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 9cb1f58f4ad32..75175a1aff0e9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -75,15 +75,14 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_add_u32_e32 v1, 4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    scratch_store_dword v1, v3, off
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -137,14 +136,13 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s32
 ; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_add_u32_e32 v1, s32, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    scratch_store_dword v1, v3, off
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, s32, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -305,14 +303,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x104
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x104, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    scratch_store_dword v1, v3, off
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, 0x104, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -374,16 +371,16 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, vcc_hi
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
 ; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_add_u32_e32 v1, vcc_hi, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    scratch_store_dword v1, v3, off
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, vcc_hi, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -525,14 +522,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4004
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    scratch_store_dword v1, v3, off
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, 0x4004, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -596,16 +592,16 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, vcc_hi
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
 ; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_add_u32_e32 v1, vcc_hi, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    scratch_store_dword v1, v3, off
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, vcc_hi, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
index 9f7fe19a17ed9..8fad64a10538a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
@@ -255,7 +255,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
 ; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b32 s2, 0x80000000
-; SI-NEXT:    v_sub_f32_e32 v2, s2, v2
+; SI-NEXT:    v_sub_f32_e32 v2, 0x80000000, v2
 ; SI-NEXT:    v_sub_f32_e64 v4, s2, |v4|
 ; SI-NEXT:    v_med3_f32 v2, v2, |v3|, v4
 ; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
@@ -290,7 +290,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_sub_f32_e32 v4, s2, v7
+; VI-NEXT:    v_sub_f32_e32 v4, 0x80000000, v7
 ; VI-NEXT:    v_sub_f32_e64 v3, s2, |v3|
 ; VI-NEXT:    v_med3_f32 v2, v4, |v2|, v3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -308,7 +308,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
 ; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s2, 0x80000000
-; GFX9-NEXT:    v_sub_f32_e32 v1, s2, v1
+; GFX9-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
 ; GFX9-NEXT:    v_sub_f32_e64 v3, s2, |v3|
 ; GFX9-NEXT:    v_med3_f32 v1, v1, |v2|, v3
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
index 8a73d7f2b85a3..ab5dc734220db 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
@@ -186,9 +186,8 @@ define <4 x half> @v_fmul_v4f16_fneg_lhs(<4 x half> %a, <4 x half> %b) {
 ; GFX8-LABEL: v_fmul_v4f16_fneg_lhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s4, 0x80008000
-; GFX8-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v2, v1, v3
@@ -223,9 +222,8 @@ define <4 x half> @v_fmul_v4f16_fneg_rhs(<4 x half> %a, <4 x half> %b) {
 ; GFX8-LABEL: v_fmul_v4f16_fneg_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s4, 0x80008000
-; GFX8-NEXT:    v_xor_b32_e32 v2, s4, v2
-; GFX8-NEXT:    v_xor_b32_e32 v3, s4, v3
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v2, v1, v3
@@ -336,10 +334,9 @@ define <6 x half> @v_fmul_v6f16_fneg_lhs(<6 x half> %a, <6 x half> %b) {
 ; GFX8-LABEL: v_fmul_v6f16_fneg_lhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s4, 0x80008000
-; GFX8-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX8-NEXT:    v_xor_b32_e32 v2, s4, v2
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX8-NEXT:    v_mul_f16_e32 v6, v0, v3
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v3, v1, v4
@@ -381,10 +378,9 @@ define <6 x half> @v_fmul_v6f16_fneg_rhs(<6 x half> %a, <6 x half> %b) {
 ; GFX8-LABEL: v_fmul_v6f16_fneg_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s4, 0x80008000
-; GFX8-NEXT:    v_xor_b32_e32 v3, s4, v3
-; GFX8-NEXT:    v_xor_b32_e32 v4, s4, v4
-; GFX8-NEXT:    v_xor_b32_e32 v5, s4, v5
+; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
+; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
+; GFX8-NEXT:    v_xor_b32_e32 v5, 0x80008000, v5
 ; GFX8-NEXT:    v_mul_f16_e32 v6, v0, v3
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v3, v1, v4
@@ -515,11 +511,10 @@ define <8 x half> @v_fmul_v8f16_fneg_lhs(<8 x half> %a, <8 x half> %b) {
 ; GFX8-LABEL: v_fmul_v8f16_fneg_lhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s4, 0x80008000
-; GFX8-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX8-NEXT:    v_xor_b32_e32 v2, s4, v2
-; GFX8-NEXT:    v_xor_b32_e32 v3, s4, v3
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
 ; GFX8-NEXT:    v_mul_f16_e32 v8, v0, v4
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v1, v5
@@ -567,11 +562,10 @@ define <8 x half> @v_fmul_v8f16_fneg_rhs(<8 x half> %a, <8 x half> %b) {
 ; GFX8-LABEL: v_fmul_v8f16_fneg_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s4, 0x80008000
-; GFX8-NEXT:    v_xor_b32_e32 v4, s4, v4
-; GFX8-NEXT:    v_xor_b32_e32 v5, s4, v5
-; GFX8-NEXT:    v_xor_b32_e32 v6, s4, v6
-; GFX8-NEXT:    v_xor_b32_e32 v7, s4, v7
+; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
+; GFX8-NEXT:    v_xor_b32_e32 v5, 0x80008000, v5
+; GFX8-NEXT:    v_xor_b32_e32 v6, 0x80008000, v6
+; GFX8-NEXT:    v_xor_b32_e32 v7, 0x80008000, v7
 ; GFX8-NEXT:    v_mul_f16_e32 v8, v0, v4
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v1, v5

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index 7843173f11b37..90bb1b9a90276 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -398,19 +398,17 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT:    s_mov_b32 s4, 0x80008000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
@@ -427,11 +425,10 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s4, 0x80008000
-; GFX8-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_log_f16_e32 v2, v0
 ; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -450,15 +447,14 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0x80008000
-; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX9-NEXT:    v_log_f16_e32 v2, v0
 ; GFX9-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index cb407ee289800..71e8aaf6ea133 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -10,7 +10,6 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    s_and_b32 s2, s2, 0x7f
-; GFX6-NEXT:    s_movk_i32 s3, 0x7f
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x60001
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -27,8 +26,8 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 6, v0
-; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -41,10 +40,9 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_and_b32 s2, s2, 0x7f
 ; GFX8-NEXT:    s_and_b32 s1, s1, 0x7f
-; GFX8-NEXT:    s_movk_i32 s3, 0x7f
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX8-NEXT:    v_mul_lo_u32 v1, -7, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
@@ -59,8 +57,8 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v1, 6, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e64 v0, v0, s0
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v1, v1, s1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -73,10 +71,9 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_and_b32 s2, s2, 0x7f
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0x7f
-; GFX9-NEXT:    s_movk_i32 s3, 0x7f
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, -7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
@@ -91,8 +88,8 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_sub_u16_e32 v1, 6, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX9-NEXT:    v_lshlrev_b16_e64 v0, v0, s0
 ; GFX9-NEXT:    v_lshrrev_b16_e64 v1, v1, s1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -147,7 +144,6 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x7f
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 7, v2
@@ -157,9 +153,9 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 6, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7f, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, v3, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7f, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -170,15 +166,14 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v4, -7, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7f
-; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 7, v2
@@ -188,9 +183,9 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v3, 6, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0x7f, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, v3, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0x7f, v3
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -201,15 +196,14 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v4, -7, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7f
-; GFX9-NEXT:    v_and_b32_e32 v1, v1, v4
-; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, 7, v2
@@ -219,9 +213,9 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    v_sub_u16_e32 v3, 6, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7f, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
-; GFX9-NEXT:    v_and_b32_e32 v2, v3, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7f, v3
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -658,9 +652,8 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0xff
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -996,23 +989,22 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-NEXT:    v_bfe_u32 v4, v1, 16, 8
 ; GFX6-NEXT:    v_and_b32_e32 v6, 7, v6
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v8
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 7, v8
 ; GFX6-NEXT:    v_and_b32_e32 v6, 7, v6
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 25, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v6, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1059,12 +1051,11 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v6, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 8
-; GFX8-NEXT:    s_movk_i32 s4, 0xff
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v2, s4, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1114,8 +1105,8 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, s4, v1
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
@@ -1189,10 +1180,9 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX6-NEXT:    s_and_b32 s2, s2, 0xffffff
-; GFX6-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -1206,8 +1196,8 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 23, v0
-; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1220,10 +1210,9 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX8-NEXT:    s_and_b32 s2, s2, 0xffffff
-; GFX8-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -1237,8 +1226,8 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 23, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v1, v1, s1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1251,10 +1240,9 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX9-NEXT:    s_and_b32 s2, s2, 0xffffff
-; GFX9-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
@@ -1268,8 +1256,8 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 23, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, s3, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v1, v1, s1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, s0, v0, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
@@ -1321,7 +1309,6 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffffff
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v2
@@ -1331,9 +1318,9 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, v3, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -1352,7 +1339,6 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffffff
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v2
@@ -1362,9 +1348,9 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, v3, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1383,7 +1369,6 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffff
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v2
@@ -1393,8 +1378,8 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, v3, v4
-; GFX9-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -1438,11 +1423,11 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s0, 24
-; GFX6-NEXT:    s_and_b32 s10, s0, 0xff
+; GFX6-NEXT:    s_and_b32 s9, s0, 0xff
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x80008
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX6-NEXT:    s_and_b32 s6, s6, 0xff
-; GFX6-NEXT:    s_or_b32 s0, s10, s0
+; GFX6-NEXT:    s_or_b32 s0, s9, s0
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    s_lshr_b32 s8, s1, 8
@@ -1461,12 +1446,12 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_or_b32 s1, s1, s6
 ; GFX6-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s2, 24
-; GFX6-NEXT:    s_and_b32 s10, s2, 0xff
+; GFX6-NEXT:    s_and_b32 s9, s2, 0xff
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x80008
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 8
 ; GFX6-NEXT:    s_and_b32 s6, s6, 0xff
-; GFX6-NEXT:    s_or_b32 s2, s10, s2
+; GFX6-NEXT:    s_or_b32 s2, s9, s2
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s8, s3, 8
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
@@ -1483,13 +1468,13 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_or_b32 s3, s3, s6
 ; GFX6-NEXT:    s_lshr_b32 s6, s4, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s4, 24
-; GFX6-NEXT:    s_and_b32 s10, s4, 0xff
+; GFX6-NEXT:    s_and_b32 s9, s4, 0xff
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x80008
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX6-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, 24
-; GFX6-NEXT:    s_or_b32 s4, s10, s4
+; GFX6-NEXT:    s_or_b32 s4, s9, s4
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
@@ -1519,13 +1504,12 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    s_mov_b32 s6, 0xffffff
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, 24
-; GFX6-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX6-NEXT:    v_and_b32_e32 v2, s6, v3
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
 ; GFX6-NEXT:    v_lshr_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -1535,23 +1519,21 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffffff
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 23, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX6-NEXT:    s_lshr_b32 s0, s3, 1
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX6-NEXT:    s_movk_i32 s9, 0xff
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_lshr_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s9, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, s9, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_bfe_u32 v2, v1, 8, 8
@@ -1653,13 +1635,12 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX8-NEXT:    s_mov_b32 s6, 0xffffff
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
-; GFX8-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
 ; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX8-NEXT:    v_and_b32_e32 v2, s6, v3
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, v2, s0
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -1669,11 +1650,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffffff
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 23, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX8-NEXT:    s_lshr_b32 s0, s3, 1
-; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s1
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, v2, s0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -1784,32 +1764,30 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s7, 0xffffff
-; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v0
+; GFX9-NEXT:    v_sub_u32_e32 v2, 23, v0
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX9-NEXT:    v_and_b32_e32 v3, s7, v3
-; GFX9-NEXT:    v_and_b32_e32 v0, s7, v0
-; GFX9-NEXT:    v_lshrrev_b32_e64 v3, v3, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX9-NEXT:    v_lshrrev_b32_e64 v2, v2, s2
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
-; GFX9-NEXT:    v_lshl_or_b32 v0, s0, v0, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v1
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, v0, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffffff
-; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v2, 23, v1
 ; GFX9-NEXT:    s_lshr_b32 s0, s3, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v2, v2, s0
 ; GFX9-NEXT:    s_mov_b32 s6, 8
 ; GFX9-NEXT:    v_lshl_or_b32 v1, s1, v1, v2
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NEXT:    s_mov_b32 s8, 16
+; GFX9-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_b32_e32 v3, s0, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX9-NEXT:    v_and_or_b32 v2, v0, s0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -1961,18 +1939,17 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v9, 24
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v9, v9
 ; GFX6-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 23
 ; GFX6-NEXT:    v_mul_lo_u32 v8, v7, v6
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v9
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v4, v6
-; GFX6-NEXT:    v_mov_b32_e32 v9, 0xffffff
-; GFX6-NEXT:    v_and_b32_e32 v5, v5, v9
-; GFX6-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; GFX6-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v9
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX6-NEXT:    v_mul_lo_u32 v7, v7, v8
@@ -1987,9 +1964,9 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 23, v4
-; GFX6-NEXT:    v_and_b32_e32 v4, v4, v9
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
-; GFX6-NEXT:    v_and_b32_e32 v4, v6, v9
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v6
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v7, 24
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -2001,10 +1978,10 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 23, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v9
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_bfe_u32 v2, v3, 1, 23
-; GFX6-NEXT:    v_and_b32_e32 v3, v4, v9
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffffff, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -2016,18 +1993,17 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v9, 24
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v9, v9
 ; GFX8-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX8-NEXT:    v_bfe_u32 v2, v2, 1, 23
 ; GFX8-NEXT:    v_mul_lo_u32 v8, v7, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v8, v9
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v4, v6
-; GFX8-NEXT:    v_mov_b32_e32 v9, 0xffffff
-; GFX8-NEXT:    v_and_b32_e32 v5, v5, v9
-; GFX8-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; GFX8-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v9
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v7, v8
@@ -2042,9 +2018,9 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
 ; GFX8-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 23, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, v4, v9
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
-; GFX8-NEXT:    v_and_b32_e32 v4, v6, v9
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v6
 ; GFX8-NEXT:    v_mul_lo_u32 v6, v7, 24
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -2056,10 +2032,10 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 23, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, v2, v9
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    v_bfe_u32 v2, v3, 1, 23
-; GFX8-NEXT:    v_and_b32_e32 v3, v4, v9
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffffff, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -2078,15 +2054,15 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v9
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v9
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v7, v6
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX9-NEXT:    v_bfe_u32 v2, v2, 1, 23
-; GFX9-NEXT:    v_bfe_u32 v3, v3, 1, 23
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v9
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GFX9-NEXT:    v_bfe_u32 v3, v3, 1, 23
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v9, v7
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v4, v6
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffffff
-; GFX9-NEXT:    v_and_b32_e32 v5, v5, v8
+; GFX9-NEXT:    v_add_u32_e32 v7, v9, v7
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
@@ -2095,11 +2071,10 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT:    v_add_u32_e32 v6, v9, v7
-; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v6
+; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v7
 ; GFX9-NEXT:    v_sub_u32_e32 v7, 23, v4
-; GFX9-NEXT:    v_and_b32_e32 v7, v7, v8
-; GFX9-NEXT:    v_and_b32_e32 v4, v4, v8
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v7, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
@@ -2111,8 +2086,8 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v4, 23, v2
-; GFX9-NEXT:    v_and_b32_e32 v4, v4, v8
-; GFX9-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v1, v2, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3308,10 +3283,9 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX9-LABEL: v_fshl_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
-; GFX9-NEXT:    v_and_b32_e32 v3, s4, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xf000f, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 1, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
@@ -3445,15 +3419,14 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ;
 ; GFX9-LABEL: v_fshl_v2i16_ssv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s2, 0xf000f
-; GFX9-NEXT:    v_and_b32_e32 v1, s2, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xf000f, v0
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, s0
 ; GFX9-NEXT:    s_lshr_b32 s0, s1, 16
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 0x10001
 ; GFX9-NEXT:    s_lshr_b32 s0, s0, 1
-; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xf000f, v0
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s1, s0
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v0, s0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -3939,17 +3912,16 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX9-LABEL: v_fshl_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
-; GFX9-NEXT:    v_and_b32_e32 v6, s4, v4
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xf000f, v4
 ; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v6, v0
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v4, v2
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v5
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xf000f, v5
 ; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v5
-; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v2, v1
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, 1, v3 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v4, v2
@@ -4703,10 +4675,9 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-LABEL: v_fshl_i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0x7f
-; GFX6-NEXT:    v_and_b32_e32 v14, s4, v8
+; GFX6-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
-; GFX6-NEXT:    v_and_b32_e32 v15, s4, v8
+; GFX6-NEXT:    v_and_b32_e32 v15, 0x7f, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v14
 ; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, 64, v14
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[0:1], v8
@@ -4752,10 +4723,9 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX8-LABEL: v_fshl_i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0x7f
-; GFX8-NEXT:    v_and_b32_e32 v14, s4, v8
+; GFX8-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v8
-; GFX8-NEXT:    v_and_b32_e32 v15, s4, v8
+; GFX8-NEXT:    v_and_b32_e32 v15, 0x7f, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v14
 ; GFX8-NEXT:    v_subrev_u32_e32 v16, vcc, 64, v14
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v8, v[0:1]
@@ -4801,10 +4771,9 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-LABEL: v_fshl_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7f
-; GFX9-NEXT:    v_and_b32_e32 v14, s4, v8
+; GFX9-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v8
-; GFX9-NEXT:    v_and_b32_e32 v15, s4, v8
+; GFX9-NEXT:    v_and_b32_e32 v15, 0x7f, v8
 ; GFX9-NEXT:    v_sub_u32_e32 v8, 64, v14
 ; GFX9-NEXT:    v_subrev_u32_e32 v16, 64, v14
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v8, v[0:1]
@@ -4902,10 +4871,9 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) {
 ; GFX6-LABEL: v_fshl_i128_ssv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_movk_i32 s8, 0x7f
-; GFX6-NEXT:    v_and_b32_e32 v6, s8, v0
+; GFX6-NEXT:    v_and_b32_e32 v6, 0x7f, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT:    v_and_b32_e32 v7, s8, v0
+; GFX6-NEXT:    v_and_b32_e32 v7, 0x7f, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v6
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], s[0:1], v0
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], s[2:3], v6
@@ -4955,10 +4923,9 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ;
 ; GFX8-LABEL: v_fshl_i128_ssv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_movk_i32 s8, 0x7f
-; GFX8-NEXT:    v_and_b32_e32 v6, s8, v0
+; GFX8-NEXT:    v_and_b32_e32 v6, 0x7f, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX8-NEXT:    v_and_b32_e32 v7, s8, v0
+; GFX8-NEXT:    v_and_b32_e32 v7, 0x7f, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v6
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v6, s[2:3]
@@ -5008,10 +4975,9 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ;
 ; GFX9-LABEL: v_fshl_i128_ssv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s8, 0x7f
-; GFX9-NEXT:    v_and_b32_e32 v6, s8, v0
+; GFX9-NEXT:    v_and_b32_e32 v6, 0x7f, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX9-NEXT:    v_and_b32_e32 v7, s8, v0
+; GFX9-NEXT:    v_and_b32_e32 v7, 0x7f, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v6
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v6, s[2:3]
@@ -6022,8 +5988,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-LABEL: v_fshl_v2i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s6, 0x7f
-; GFX6-NEXT:    v_and_b32_e32 v23, s6, v16
+; GFX6-NEXT:    v_and_b32_e32 v23, 0x7f, v16
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, 64, v23
 ; GFX6-NEXT:    v_lshr_b64 v[17:18], v[0:1], v17
 ; GFX6-NEXT:    v_lshl_b64 v[21:22], v[2:3], v23
@@ -6031,7 +5996,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v16
 ; GFX6-NEXT:    v_or_b32_e32 v21, v17, v21
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v17, 31, v10
-; GFX6-NEXT:    v_and_b32_e32 v24, s6, v16
+; GFX6-NEXT:    v_and_b32_e32 v24, 0x7f, v16
 ; GFX6-NEXT:    v_lshr_b64 v[10:11], v[10:11], 1
 ; GFX6-NEXT:    v_or_b32_e32 v9, v9, v17
 ; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 64, v24
@@ -6066,9 +6031,9 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_or_b32_e32 v1, v18, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v17, v8
 ; GFX6-NEXT:    v_or_b32_e32 v3, v16, v9
-; GFX6-NEXT:    v_and_b32_e32 v16, s6, v20
+; GFX6-NEXT:    v_and_b32_e32 v16, 0x7f, v20
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v20
-; GFX6-NEXT:    v_and_b32_e32 v17, s6, v8
+; GFX6-NEXT:    v_and_b32_e32 v17, 0x7f, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v16
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[4:5], v8
 ; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v16
@@ -6114,8 +6079,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-LABEL: v_fshl_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s6, 0x7f
-; GFX8-NEXT:    v_and_b32_e32 v23, s6, v16
+; GFX8-NEXT:    v_and_b32_e32 v23, 0x7f, v16
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, 64, v23
 ; GFX8-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[21:22], v23, v[2:3]
@@ -6123,7 +6087,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_xor_b32_e32 v16, -1, v16
 ; GFX8-NEXT:    v_or_b32_e32 v21, v17, v21
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 31, v10
-; GFX8-NEXT:    v_and_b32_e32 v24, s6, v16
+; GFX8-NEXT:    v_and_b32_e32 v24, 0x7f, v16
 ; GFX8-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
 ; GFX8-NEXT:    v_or_b32_e32 v9, v9, v17
 ; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, 64, v24
@@ -6158,9 +6122,9 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_or_b32_e32 v1, v18, v3
 ; GFX8-NEXT:    v_or_b32_e32 v2, v17, v8
 ; GFX8-NEXT:    v_or_b32_e32 v3, v16, v9
-; GFX8-NEXT:    v_and_b32_e32 v16, s6, v20
+; GFX8-NEXT:    v_and_b32_e32 v16, 0x7f, v20
 ; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v20
-; GFX8-NEXT:    v_and_b32_e32 v17, s6, v8
+; GFX8-NEXT:    v_and_b32_e32 v17, 0x7f, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v16
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v8, v[4:5]
 ; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v16, v[6:7]
@@ -6206,8 +6170,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-LABEL: v_fshl_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s6, 0x7f
-; GFX9-NEXT:    v_and_b32_e32 v23, s6, v16
+; GFX9-NEXT:    v_and_b32_e32 v23, 0x7f, v16
 ; GFX9-NEXT:    v_sub_u32_e32 v17, 64, v23
 ; GFX9-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[21:22], v23, v[2:3]
@@ -6215,7 +6178,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_xor_b32_e32 v16, -1, v16
 ; GFX9-NEXT:    v_or_b32_e32 v21, v17, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 31, v10
-; GFX9-NEXT:    v_and_b32_e32 v24, s6, v16
+; GFX9-NEXT:    v_and_b32_e32 v24, 0x7f, v16
 ; GFX9-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
 ; GFX9-NEXT:    v_or_b32_e32 v9, v9, v17
 ; GFX9-NEXT:    v_sub_u32_e32 v16, 64, v24
@@ -6250,9 +6213,9 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_or_b32_e32 v1, v18, v3
 ; GFX9-NEXT:    v_or_b32_e32 v2, v17, v8
 ; GFX9-NEXT:    v_or_b32_e32 v3, v16, v9
-; GFX9-NEXT:    v_and_b32_e32 v16, s6, v20
+; GFX9-NEXT:    v_and_b32_e32 v16, 0x7f, v20
 ; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v20
-; GFX9-NEXT:    v_and_b32_e32 v17, s6, v8
+; GFX9-NEXT:    v_and_b32_e32 v17, 0x7f, v8
 ; GFX9-NEXT:    v_sub_u32_e32 v8, 64, v16
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v8, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v16, v[6:7]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 52f78974d9aa6..8fa8b58e22047 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -10,11 +10,10 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    s_and_b32 s2, s2, 0x7f
-; GFX6-NEXT:    s_movk_i32 s3, 0x7f
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX6-NEXT:    v_mul_lo_u32 v1, -7, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -28,8 +27,8 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 6, v0
-; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX6-NEXT:    v_lshl_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s1, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -41,11 +40,10 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_and_b32 s2, s2, 0x7f
-; GFX8-NEXT:    s_movk_i32 s3, 0x7f
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX8-NEXT:    v_mul_lo_u32 v1, -7, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -59,8 +57,8 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v1, 6, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e64 v1, v1, s0
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -72,11 +70,10 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_and_b32 s2, s2, 0x7f
-; GFX9-NEXT:    s_movk_i32 s3, 0x7f
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX9-NEXT:    v_mul_lo_u32 v1, -7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
@@ -90,8 +87,8 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_sub_u16_e32 v1, 6, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX9-NEXT:    v_lshlrev_b16_e64 v1, v1, s0
 ; GFX9-NEXT:    v_lshrrev_b16_e64 v0, v0, s1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -139,14 +136,13 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0x7f, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v4, -7, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x7f
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 7, v2
@@ -156,8 +152,8 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 6, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX6-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7f, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -170,14 +166,13 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0x7f, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v4, -7, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7f
-; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 7, v2
@@ -187,8 +182,8 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v3, 6, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX8-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX8-NEXT:    v_and_b32_e32 v3, 0x7f, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -201,14 +196,13 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7f, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v4, -7, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7f
-; GFX9-NEXT:    v_and_b32_e32 v1, v1, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, 7, v2
@@ -218,8 +212,8 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    v_sub_u16_e32 v3, 6, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX9-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7f, v3
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -646,10 +640,9 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v4
@@ -660,7 +653,7 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -980,43 +973,42 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_and_b32_e32 v11, 0xff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v10, v10, v11
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v10
-; GFX6-NEXT:    v_and_b32_e32 v10, 7, v7
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v10, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 7, v7
 ; GFX6-NEXT:    v_xor_b32_e32 v7, -1, v7
 ; GFX6-NEXT:    v_and_b32_e32 v7, 7, v7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v7, v3
 ; GFX6-NEXT:    v_bfe_u32 v7, v1, 8, 8
-; GFX6-NEXT:    v_lshrrev_b32_e32 v7, v10, v7
-; GFX6-NEXT:    v_or_b32_e32 v3, v3, v7
-; GFX6-NEXT:    v_and_b32_e32 v7, 7, v8
-; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v7
+; GFX6-NEXT:    v_xor_b32_e32 v7, -1, v8
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX6-NEXT:    v_and_b32_e32 v8, 7, v8
+; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, 7, v8
+; GFX6-NEXT:    v_and_b32_e32 v7, 7, v7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0xff
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v8, v4
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v7, v1
-; GFX6-NEXT:    v_xor_b32_e32 v7, -1, v9
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX6-NEXT:    v_and_b32_e32 v4, 7, v9
-; GFX6-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v9
+; GFX6-NEXT:    v_and_b32_e32 v3, 7, v9
+; GFX6-NEXT:    v_and_b32_e32 v4, 7, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, v7, v5
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v4, v6
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v3, v6
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, v4, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -1061,12 +1053,11 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 8
-; GFX8-NEXT:    s_movk_i32 s4, 0xff
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v2, s4, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1115,8 +1106,8 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, s4, v1
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
@@ -1190,10 +1181,9 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX6-NEXT:    s_and_b32 s2, s2, 0xffffff
-; GFX6-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    s_and_b32 s1, s1, 0xffffff
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
@@ -1208,8 +1198,8 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 23, v0
-; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX6-NEXT:    v_lshl_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s1, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1222,10 +1212,9 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX8-NEXT:    s_and_b32 s2, s2, 0xffffff
-; GFX8-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX8-NEXT:    s_and_b32 s1, s1, 0xffffff
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
@@ -1240,8 +1229,8 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 23, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1254,10 +1243,9 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX9-NEXT:    s_and_b32 s2, s2, 0xffffff
-; GFX9-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffffff
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xffffff
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
@@ -1272,8 +1260,8 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 23, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, s0, v1, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
@@ -1322,12 +1310,11 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffffff
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v2
@@ -1337,8 +1324,8 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX6-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1354,12 +1341,11 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffffff
-; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v2
@@ -1369,8 +1355,8 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX8-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1383,15 +1369,14 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffffe8
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffff
-; GFX9-NEXT:    v_and_b32_e32 v1, v1, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v2
@@ -1401,8 +1386,8 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX9-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -1450,46 +1435,46 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_lshr_b32 s8, s1, 8
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_and_b32 s10, s0, 0xff
+; GFX6-NEXT:    s_and_b32 s9, s0, 0xff
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x80008
 ; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
-; GFX6-NEXT:    s_or_b32 s0, s10, s0
+; GFX6-NEXT:    s_or_b32 s0, s9, s0
 ; GFX6-NEXT:    s_or_b32 s1, s7, s1
 ; GFX6-NEXT:    s_and_b32 s7, s8, 0xff
 ; GFX6-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX6-NEXT:    s_lshr_b32 s10, s2, 24
-; GFX6-NEXT:    s_and_b32 s12, s2, 0xff
+; GFX6-NEXT:    s_lshr_b32 s9, s2, 24
+; GFX6-NEXT:    s_and_b32 s11, s2, 0xff
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x80008
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 8
 ; GFX6-NEXT:    s_and_b32 s8, s8, 0xff
-; GFX6-NEXT:    s_or_b32 s2, s12, s2
+; GFX6-NEXT:    s_or_b32 s2, s11, s2
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
-; GFX6-NEXT:    s_lshr_b32 s11, s3, 8
+; GFX6-NEXT:    s_lshr_b32 s10, s3, 8
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX6-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX6-NEXT:    s_or_b32 s2, s2, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX6-NEXT:    s_and_b32 s8, s11, 0xff
+; GFX6-NEXT:    s_and_b32 s8, s10, 0xff
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX6-NEXT:    s_or_b32 s3, s10, s3
+; GFX6-NEXT:    s_or_b32 s3, s9, s3
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
 ; GFX6-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX6-NEXT:    s_or_b32 s3, s3, s8
 ; GFX6-NEXT:    s_lshr_b32 s8, s4, 16
-; GFX6-NEXT:    s_lshr_b32 s10, s4, 24
-; GFX6-NEXT:    s_and_b32 s12, s4, 0xff
+; GFX6-NEXT:    s_lshr_b32 s9, s4, 24
+; GFX6-NEXT:    s_and_b32 s11, s4, 0xff
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x80008
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX6-NEXT:    s_and_b32 s8, s8, 0xff
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, 24
-; GFX6-NEXT:    s_or_b32 s4, s12, s4
+; GFX6-NEXT:    s_or_b32 s4, s11, s4
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
@@ -1498,7 +1483,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT:    s_lshr_b32 s11, s5, 8
+; GFX6-NEXT:    s_lshr_b32 s10, s5, 8
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v2
@@ -1507,9 +1492,9 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX6-NEXT:    s_and_b32 s8, s11, 0xff
+; GFX6-NEXT:    s_and_b32 s8, s10, 0xff
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    s_or_b32 s5, s10, s5
+; GFX6-NEXT:    s_or_b32 s5, s9, s5
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v0
 ; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
@@ -1523,13 +1508,12 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, 24
-; GFX6-NEXT:    s_mov_b32 s8, 0xffffff
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v0
 ; GFX6-NEXT:    s_lshl_b32 s4, s6, 17
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    s_or_b32 s0, s4, s0
-; GFX6-NEXT:    v_and_b32_e32 v2, s8, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
@@ -1542,25 +1526,23 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX6-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffffff
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 23, v1
 ; GFX6-NEXT:    s_lshl_b32 s0, s7, 17
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
-; GFX6-NEXT:    s_movk_i32 s9, 0xff
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s3, v1
 ; GFX6-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s9, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, s9, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_bfe_u32 v2, v1, 8, 8
@@ -1656,14 +1638,13 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
-; GFX8-NEXT:    s_mov_b32 s8, 0xffffff
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX8-NEXT:    s_lshl_b32 s4, s6, 17
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX8-NEXT:    s_or_b32 s0, s4, s0
-; GFX8-NEXT:    v_and_b32_e32 v2, s8, v3
-; GFX8-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
@@ -1676,13 +1657,12 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffffff
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 23, v1
 ; GFX8-NEXT:    s_lshl_b32 s0, s7, 17
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v1, v1, s3
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -1786,39 +1766,37 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
-; GFX9-NEXT:    s_mov_b32 s10, 0xffffff
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v0
 ; GFX9-NEXT:    s_lshl_b32 s4, s7, 17
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX9-NEXT:    v_and_b32_e32 v0, s10, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX9-NEXT:    s_or_b32 s0, s4, s0
-; GFX9-NEXT:    v_and_b32_e32 v3, s10, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s2
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
-; GFX9-NEXT:    v_lshl_or_b32 v0, s0, v3, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v1
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, v2, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX9-NEXT:    s_bfe_u32 s9, s9, 0x100000
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffffff
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v2, 23, v1
 ; GFX9-NEXT:    s_lshl_b32 s0, s9, 17
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 1
-; GFX9-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    v_and_b32_e32 v3, v3, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v1, v1, s3
 ; GFX9-NEXT:    s_mov_b32 s6, 8
-; GFX9-NEXT:    v_lshl_or_b32 v1, s0, v3, v1
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, v2, v1
 ; GFX9-NEXT:    s_mov_b32 s8, 16
+; GFX9-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_b32_e32 v3, s0, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX9-NEXT:    v_and_or_b32 v2, v0, s0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -1973,19 +1951,19 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v9, 24
 ; GFX6-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v8, v7, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v9
-; GFX6-NEXT:    v_mov_b32_e32 v9, 0xffffff
-; GFX6-NEXT:    v_and_b32_e32 v5, v5, v9
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX6-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v9
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 24, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
@@ -1995,15 +1973,14 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v7, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, 23, v4
-; GFX6-NEXT:    v_and_b32_e32 v7, v7, v9
+; GFX6-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v8, v6
-; GFX6-NEXT:    v_and_b32_e32 v4, v4, v9
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v7, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, v3, v9
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v5, v6
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v2
@@ -2013,8 +1990,8 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 23, v2
-; GFX6-NEXT:    v_and_b32_e32 v4, v4, v9
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v9
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -2030,19 +2007,19 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v9, 24
 ; GFX8-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v8, v7, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v8, v9
-; GFX8-NEXT:    v_mov_b32_e32 v9, 0xffffff
-; GFX8-NEXT:    v_and_b32_e32 v5, v5, v9
 ; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX8-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GFX8-NEXT:    v_and_b32_e32 v2, v2, v9
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v6
 ; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 24, v4
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
@@ -2052,15 +2029,14 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v6, v7, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 23, v4
-; GFX8-NEXT:    v_and_b32_e32 v7, v7, v9
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v8, v6
-; GFX8-NEXT:    v_and_b32_e32 v4, v4, v9
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v7, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, v3, v9
 ; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v5, v6
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 24, v2
@@ -2070,8 +2046,8 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 23, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, v4, v9
-; GFX8-NEXT:    v_and_b32_e32 v2, v2, v9
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -2091,31 +2067,30 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v9
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v7, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v9, v7
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v4, v6
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffffff
-; GFX9-NEXT:    v_and_b32_e32 v5, v5, v8
 ; GFX9-NEXT:    v_add_u32_e32 v7, v9, v7
-; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v5, v7
-; GFX9-NEXT:    v_and_b32_e32 v2, v2, v8
-; GFX9-NEXT:    v_and_b32_e32 v3, v3, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX9-NEXT:    v_mul_lo_u32 v7, v7, 24
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
-; GFX9-NEXT:    v_mul_lo_u32 v7, v7, 24
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v6, 23, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, v4, v8
-; GFX9-NEXT:    v_and_b32_e32 v6, v6, v8
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v6, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v5, v7
@@ -2126,8 +2101,8 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v4, 23, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, v2, v8
-; GFX9-NEXT:    v_and_b32_e32 v4, v4, v8
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3215,10 +3190,9 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX9-LABEL: v_fshr_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
-; GFX9-NEXT:    v_and_b32_e32 v3, s4, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xf000f, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
@@ -3381,13 +3355,12 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ;
 ; GFX9-LABEL: v_fshr_v2i16_ssv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s2, 0xf000f
-; GFX9-NEXT:    v_and_b32_e32 v1, s2, v0
-; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xf000f, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x10001
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xf000f, v0
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v0, s0
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v1, s1
@@ -3894,12 +3867,11 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX6-LABEL: v_fshr_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v12, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX6-NEXT:    v_and_b32_e32 v8, v8, v12
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX6-NEXT:    v_or_b32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX6-NEXT:    v_and_b32_e32 v10, v10, v12
+; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff, v10
 ; GFX6-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX6-NEXT:    s_bfe_u32 s4, 1, 0x100000
 ; GFX6-NEXT:    v_bfe_u32 v10, v4, 1, 15
@@ -4029,17 +4001,16 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX9-LABEL: v_fshr_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
-; GFX9-NEXT:    v_and_b32_e32 v6, s4, v4
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xf000f, v4
 ; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v4, v0
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v6, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v5
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v5
-; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xf000f, v5
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v4, v1
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v2, v3
@@ -4820,11 +4791,10 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-LABEL: v_fshr_i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0x7f
-; GFX6-NEXT:    v_and_b32_e32 v14, s4, v8
+; GFX6-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT:    v_and_b32_e32 v15, s4, v8
+; GFX6-NEXT:    v_and_b32_e32 v15, 0x7f, v8
 ; GFX6-NEXT:    v_lshl_b64 v[8:9], v[0:1], 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v0
@@ -4869,11 +4839,10 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX8-LABEL: v_fshr_i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0x7f
-; GFX8-NEXT:    v_and_b32_e32 v14, s4, v8
+; GFX8-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v15, s4, v8
+; GFX8-NEXT:    v_and_b32_e32 v15, 0x7f, v8
 ; GFX8-NEXT:    v_lshlrev_b64 v[8:9], 1, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v0
@@ -4918,11 +4887,10 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-LABEL: v_fshr_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7f
-; GFX9-NEXT:    v_and_b32_e32 v14, s4, v8
+; GFX9-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v15, s4, v8
+; GFX9-NEXT:    v_and_b32_e32 v15, 0x7f, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], 1, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
@@ -5019,11 +4987,10 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) {
 ; GFX6-LABEL: v_fshr_i128_ssv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_movk_i32 s8, 0x7f
-; GFX6-NEXT:    v_and_b32_e32 v6, s8, v0
+; GFX6-NEXT:    v_and_b32_e32 v6, 0x7f, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    s_mov_b32 s9, 0
-; GFX6-NEXT:    v_and_b32_e32 v7, s8, v0
+; GFX6-NEXT:    v_and_b32_e32 v7, 0x7f, v0
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX6-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX6-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
@@ -5072,11 +5039,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ;
 ; GFX8-LABEL: v_fshr_i128_ssv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_movk_i32 s8, 0x7f
-; GFX8-NEXT:    v_and_b32_e32 v6, s8, v0
+; GFX8-NEXT:    v_and_b32_e32 v6, 0x7f, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX8-NEXT:    s_mov_b32 s9, 0
-; GFX8-NEXT:    v_and_b32_e32 v7, s8, v0
+; GFX8-NEXT:    v_and_b32_e32 v7, 0x7f, v0
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX8-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX8-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
@@ -5125,11 +5091,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ;
 ; GFX9-LABEL: v_fshr_i128_ssv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s8, 0x7f
-; GFX9-NEXT:    v_and_b32_e32 v6, s8, v0
+; GFX9-NEXT:    v_and_b32_e32 v6, 0x7f, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX9-NEXT:    s_mov_b32 s9, 0
-; GFX9-NEXT:    v_and_b32_e32 v7, s8, v0
+; GFX9-NEXT:    v_and_b32_e32 v7, 0x7f, v0
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
@@ -6145,17 +6110,16 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-LABEL: v_fshr_v2i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s6, 0x7f
 ; GFX6-NEXT:    v_xor_b32_e32 v17, -1, v16
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT:    v_and_b32_e32 v23, s6, v17
+; GFX6-NEXT:    v_and_b32_e32 v23, 0x7f, v17
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v17, 31, v1
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v17
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, 64, v23
 ; GFX6-NEXT:    v_lshr_b64 v[17:18], v[0:1], v17
 ; GFX6-NEXT:    v_lshl_b64 v[21:22], v[2:3], v23
-; GFX6-NEXT:    v_and_b32_e32 v24, s6, v16
+; GFX6-NEXT:    v_and_b32_e32 v24, 0x7f, v16
 ; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 64, v24
 ; GFX6-NEXT:    v_or_b32_e32 v21, v17, v21
 ; GFX6-NEXT:    v_or_b32_e32 v22, v18, v22
@@ -6191,7 +6155,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v18, v3
 ; GFX6-NEXT:    v_or_b32_e32 v3, v16, v9
-; GFX6-NEXT:    v_and_b32_e32 v17, s6, v8
+; GFX6-NEXT:    v_and_b32_e32 v17, 0x7f, v8
 ; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
 ; GFX6-NEXT:    v_or_b32_e32 v6, v6, v4
@@ -6204,7 +6168,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_lshl_b64 v[4:5], v[8:9], v17
 ; GFX6-NEXT:    v_lshl_b64 v[8:9], v[8:9], v18
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX6-NEXT:    v_and_b32_e32 v16, s6, v20
+; GFX6-NEXT:    v_and_b32_e32 v16, 0x7f, v20
 ; GFX6-NEXT:    v_cndmask_b32_e32 v18, 0, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v19, 0, v5, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v10, vcc
@@ -6237,17 +6201,16 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-LABEL: v_fshr_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s6, 0x7f
 ; GFX8-NEXT:    v_xor_b32_e32 v17, -1, v16
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v23, s6, v17
+; GFX8-NEXT:    v_and_b32_e32 v23, 0x7f, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 31, v1
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v17
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, 64, v23
 ; GFX8-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[21:22], v23, v[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v24, s6, v16
+; GFX8-NEXT:    v_and_b32_e32 v24, 0x7f, v16
 ; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, 64, v24
 ; GFX8-NEXT:    v_or_b32_e32 v21, v17, v21
 ; GFX8-NEXT:    v_or_b32_e32 v22, v18, v22
@@ -6283,7 +6246,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
 ; GFX8-NEXT:    v_or_b32_e32 v1, v18, v3
 ; GFX8-NEXT:    v_or_b32_e32 v3, v16, v9
-; GFX8-NEXT:    v_and_b32_e32 v17, s6, v8
+; GFX8-NEXT:    v_and_b32_e32 v17, 0x7f, v8
 ; GFX8-NEXT:    v_lshlrev_b64 v[8:9], 1, v[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
 ; GFX8-NEXT:    v_or_b32_e32 v6, v6, v4
@@ -6296,7 +6259,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v17, v[8:9]
 ; GFX8-NEXT:    v_lshlrev_b64 v[8:9], v18, v[8:9]
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX8-NEXT:    v_and_b32_e32 v16, s6, v20
+; GFX8-NEXT:    v_and_b32_e32 v16, 0x7f, v20
 ; GFX8-NEXT:    v_cndmask_b32_e32 v18, 0, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v19, 0, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v10, vcc
@@ -6329,17 +6292,16 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-LABEL: v_fshr_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s6, 0x7f
 ; GFX9-NEXT:    v_xor_b32_e32 v17, -1, v16
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v23, s6, v17
+; GFX9-NEXT:    v_and_b32_e32 v23, 0x7f, v17
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 31, v1
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v17
 ; GFX9-NEXT:    v_sub_u32_e32 v17, 64, v23
 ; GFX9-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[21:22], v23, v[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v24, s6, v16
+; GFX9-NEXT:    v_and_b32_e32 v24, 0x7f, v16
 ; GFX9-NEXT:    v_sub_u32_e32 v16, 64, v24
 ; GFX9-NEXT:    v_or_b32_e32 v21, v17, v21
 ; GFX9-NEXT:    v_or_b32_e32 v22, v18, v22
@@ -6375,7 +6337,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
 ; GFX9-NEXT:    v_or_b32_e32 v1, v18, v3
 ; GFX9-NEXT:    v_or_b32_e32 v3, v16, v9
-; GFX9-NEXT:    v_and_b32_e32 v17, s6, v8
+; GFX9-NEXT:    v_and_b32_e32 v17, 0x7f, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], 1, v[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
 ; GFX9-NEXT:    v_or_b32_e32 v6, v6, v4
@@ -6388,7 +6350,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v17, v[8:9]
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], v18, v[8:9]
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX9-NEXT:    v_and_b32_e32 v16, s6, v20
+; GFX9-NEXT:    v_and_b32_e32 v16, 0x7f, v20
 ; GFX9-NEXT:    v_cndmask_b32_e32 v18, 0, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v19, 0, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v10, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
index dc08cd9bb8b1f..a6a3237ee929f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
@@ -82,9 +82,8 @@ define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
 ; The offset to the dynamic shared memory array should be aligned on the
 ; maximal one.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_4:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48
 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
-; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x48, [[IDX]]
 define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %vidx = add i32 %tid.x, %idx
@@ -101,9 +100,8 @@ define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
 
 ; Honor the explicit alignment from the specified variable.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_5:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
-; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x44, [[IDX]]
 define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %vidx = add i32 %tid.x, %idx
@@ -120,9 +118,8 @@ define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
 
 ; Honor the explicit alignment from the specified variable.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_6:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
-; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x50, [[IDX]]
 define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %vidx = add i32 %tid.x, %idx

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index 17fcb5488684c..0491fc1f926bb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -332,11 +332,10 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(<2 x i16> addrspace(4)* inreg %
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, s1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v0, s1, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v1
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
@@ -547,12 +546,11 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i
 ; GFX7-LABEL: insertelement_v_v2i16_v_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dword v0, v[0:1]
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v3, v0, v1
@@ -1013,15 +1011,14 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s2, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
@@ -1306,13 +1303,12 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(<4 x i16> addrspace(1)* %ptr, i
 ; GFX7-LABEL: insertelement_v_v4i16_v_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v3, s0, v3
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v3
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX7-NEXT:    v_xor_b32_e32 v3, -1, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v4, 0
@@ -2001,20 +1997,19 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT:    s_mov_b32 s8, 0xffff
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s8, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
@@ -2380,15 +2375,14 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
@@ -3486,17 +3480,16 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT:    s_mov_b32 s20, 0xffff
 ; GFX7-NEXT:    v_mov_b32_e32 v9, s18
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, s20, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v10, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s20, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
@@ -4068,19 +4061,18 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX7-NEXT:    s_mov_b64 s[16:17], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64
 ; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index 3e6bc4e425246..f2aca458d8d23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -51,17 +51,16 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -133,17 +132,16 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
@@ -217,15 +215,14 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -301,17 +298,16 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -386,15 +382,14 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(<2 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    buffer_load_ushort v2, off, s[0:3], 0
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, v2, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -466,17 +461,16 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_mov_b32_e32 v3, s2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
@@ -546,15 +540,14 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
@@ -624,15 +617,14 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -868,10 +860,10 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-LABEL: insertelement_s_v4i8_v_s:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT:    s_movk_i32 s5, 0xff
-; GFX9-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    s_mov_b32 s1, 8
 ; GFX9-NEXT:    s_mov_b32 s2, 16
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s3, s0, 24
@@ -938,17 +930,16 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-LABEL: insertelement_s_v4i8_v_s:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX7-NEXT:    s_movk_i32 s2, 0xff
-; GFX7-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s5, s0, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
-; GFX7-NEXT:    s_and_b32 s3, s0, 0xff
-; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX7-NEXT:    s_and_b32 s2, s0, 0xff
+; GFX7-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
-; GFX7-NEXT:    s_or_b32 s3, s3, s5
+; GFX7-NEXT:    s_or_b32 s2, s2, s3
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX7-NEXT:    s_or_b32 s0, s3, s0
+; GFX7-NEXT:    s_or_b32 s0, s2, s0
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_and_b32 s1, s4, 3
@@ -959,7 +950,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v2, s2, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
@@ -1088,28 +1079,27 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX7-NEXT:    s_movk_i32 s2, 0xff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s5, s0, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
-; GFX7-NEXT:    s_and_b32 s3, s0, 0xff
-; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX7-NEXT:    s_and_b32 s2, s0, 0xff
+; GFX7-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
-; GFX7-NEXT:    s_or_b32 s3, s3, s5
+; GFX7-NEXT:    s_or_b32 s2, s2, s3
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX7-NEXT:    s_or_b32 s0, s3, s0
+; GFX7-NEXT:    s_or_b32 s0, s2, s0
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_and_b32 s1, s4, 0xff
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, s1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v0, s2, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v2, s2, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
@@ -1236,29 +1226,28 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-LABEL: insertelement_s_v4i8_v_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX7-NEXT:    s_movk_i32 s2, 0xff
 ; GFX7-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s4, s0, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
-; GFX7-NEXT:    s_and_b32 s3, s0, 0xff
-; GFX7-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX7-NEXT:    s_and_b32 s2, s0, 0xff
+; GFX7-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
-; GFX7-NEXT:    s_or_b32 s3, s3, s4
+; GFX7-NEXT:    s_or_b32 s2, s2, s3
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX7-NEXT:    s_or_b32 s0, s3, s0
+; GFX7-NEXT:    s_or_b32 s0, s2, s0
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s2, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v2, s2, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
@@ -1387,37 +1376,36 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX7-NEXT:    v_and_b32_e32 v1, 3, v2
 ; GFX7-NEXT:    s_and_b32 s0, s2, 0xff
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v3, s0, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v2, v1
-; GFX7-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xff, v1
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -1533,18 +1521,17 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NEXT:    s_and_b32 s1, s2, 3
-; GFX7-NEXT:    v_and_b32_e32 v1, s0, v2
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s1, v1
-; GFX7-NEXT:    s_lshl_b32 s1, 0xff, s1
-; GFX7-NEXT:    s_not_b32 s1, s1
+; GFX7-NEXT:    s_and_b32 s0, s2, 3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
+; GFX7-NEXT:    s_lshl_b32 s0, 0xff, s0
+; GFX7-NEXT:    s_not_b32 s0, s0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_bfe_u32 v4, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -1552,7 +1539,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, s1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
@@ -1678,38 +1665,36 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s2, 0xff
-; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX7-NEXT:    v_and_b32_e32 v2, s2, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v3, v1
-; GFX7-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 3, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xff, v1
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v5, s2, v0
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -2131,64 +2116,62 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NEXT:    s_lshr_b32 s1, s3, 2
-; GFX7-NEXT:    s_and_b32 s3, s3, 3
+; GFX7-NEXT:    s_and_b32 s1, s3, 3
+; GFX7-NEXT:    s_lshr_b32 s0, s3, 2
 ; GFX7-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX7-NEXT:    s_lshl_b32 s3, s3, 3
-; GFX7-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX7-NEXT:    s_lshl_b32 s3, 0xff, s3
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
-; GFX7-NEXT:    s_not_b32 s3, s3
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX7-NEXT:    s_lshl_b32 s1, 0xff, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX7-NEXT:    s_not_b32 s1, s1
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v5, s0, v0
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v7, s0, v1
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v6, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v3, s3, v3
-; GFX7-NEXT:    v_or_b32_e32 v3, s2, v3
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v2, s1, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v5, v0, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v2, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v7
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -2373,7 +2356,6 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bfe_u32 s6, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s2, s0, 24
@@ -2403,32 +2385,32 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xff, s4
 ; GFX7-NEXT:    s_andn2_b32 s3, s3, s4
-; GFX7-NEXT:    v_or_b32_e32 v3, s3, v0
+; GFX7-NEXT:    v_or_b32_e32 v2, s3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v5, v0, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX7-NEXT:    v_bfe_u32 v3, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, v1, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_bfe_u32 v4, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -2631,36 +2613,35 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX7-NEXT:    s_movk_i32 s5, 0xff
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s7, s0, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s6, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX7-NEXT:    s_and_b32 s6, s0, 0xff
-; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX7-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
-; GFX7-NEXT:    s_or_b32 s6, s6, s7
+; GFX7-NEXT:    s_or_b32 s5, s5, s6
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX7-NEXT:    s_or_b32 s0, s6, s0
+; GFX7-NEXT:    s_or_b32 s0, s5, s0
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX7-NEXT:    s_bfe_u32 s6, s1, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
 ; GFX7-NEXT:    s_and_b32 s2, s1, 0xff
-; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80010
-; GFX7-NEXT:    s_or_b32 s2, s2, s6
+; GFX7-NEXT:    s_or_b32 s2, s2, s5
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s1, s2, s1
 ; GFX7-NEXT:    s_lshl_b32 s2, s3, 24
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX7-NEXT:    s_and_b32 s2, s4, 0xff
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX7-NEXT:    v_lshl_b32_e32 v3, s2, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
@@ -2670,7 +2651,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
 ; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v4, s5, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -2887,37 +2868,36 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-LABEL: insertelement_s_v8i8_v_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 2, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s6, s0, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s5, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX7-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_and_b32 s4, s0, 0xff
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
-; GFX7-NEXT:    s_or_b32 s5, s5, s6
+; GFX7-NEXT:    s_or_b32 s4, s4, s5
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX7-NEXT:    s_or_b32 s0, s5, s0
+; GFX7-NEXT:    s_or_b32 s0, s4, s0
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX7-NEXT:    s_bfe_u32 s5, s1, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s4, s1, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
 ; GFX7-NEXT:    s_and_b32 s2, s1, 0xff
-; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX7-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80010
-; GFX7-NEXT:    s_or_b32 s2, s2, s5
+; GFX7-NEXT:    s_or_b32 s2, s2, s4
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s1, s2, s1
 ; GFX7-NEXT:    s_lshl_b32 s2, s3, 24
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
@@ -2928,7 +2908,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
 ; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -3131,64 +3111,62 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 2, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
-; GFX7-NEXT:    s_and_b32 s1, s2, 0xff
+; GFX7-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v5, s1, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v4, s0, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xff, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX7-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v11, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v8, s0, v0
+; GFX7-NEXT:    v_bfe_u32 v8, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v10, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v10, s0, v1
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX7-NEXT:    v_or_b32_e32 v8, v9, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
-; GFX7-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v2, v6, v2
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v7, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v5, v0, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v3, v1, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -3360,64 +3338,62 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
-; GFX7-NEXT:    s_lshr_b32 s1, s2, 2
-; GFX7-NEXT:    s_and_b32 s2, s2, 3
-; GFX7-NEXT:    v_and_b32_e32 v2, v2, v3
-; GFX7-NEXT:    s_lshl_b32 s2, s2, 3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s2, v2
-; GFX7-NEXT:    s_lshl_b32 s2, 0xff, s2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
-; GFX7-NEXT:    s_not_b32 s2, s2
+; GFX7-NEXT:    s_and_b32 s1, s2, 3
+; GFX7-NEXT:    s_lshr_b32 s0, s2, 2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v2
+; GFX7-NEXT:    s_lshl_b32 s1, 0xff, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX7-NEXT:    s_not_b32 s1, s1
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v9, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v6, s0, v0
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v8, s0, v1
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT:    v_or_b32_e32 v7, v8, v9
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_or_b32_e32 v0, v6, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v7, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v4, s2, v4
-; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v3, s1, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v5, v0, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v3, v1, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -3588,62 +3564,60 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 2, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xff, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX7-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v11, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v8, s0, v0
+; GFX7-NEXT:    v_bfe_u32 v8, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v10, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v10, s0, v1
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX7-NEXT:    v_or_b32_e32 v8, v9, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
-; GFX7-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v3, v6, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v7, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v3, v5, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v5, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v4, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v7
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -4349,109 +4323,107 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX7-NEXT:    s_and_b32 s1, s3, 3
+; GFX7-NEXT:    s_and_b32 s0, s3, 3
 ; GFX7-NEXT:    s_lshr_b32 s4, s3, 2
-; GFX7-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX7-NEXT:    s_lshl_b32 s5, s2, s1
-; GFX7-NEXT:    s_lshl_b32 s1, 0xff, s1
+; GFX7-NEXT:    s_and_b32 s1, s2, 0xff
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX7-NEXT:    s_lshl_b32 s5, s1, s0
+; GFX7-NEXT:    s_lshl_b32 s0, 0xff, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT:    s_not_b32 s6, s1
+; GFX7-NEXT:    s_not_b32 s6, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v12, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v9, s0, v0
+; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v11, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v11, s0, v1
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v14, v2, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
-; GFX7-NEXT:    v_and_b32_e32 v13, v2, v4
+; GFX7-NEXT:    v_bfe_u32 v13, v2, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xff, v2
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v16, v3, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v15, v3, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
-; GFX7-NEXT:    v_and_b32_e32 v15, v3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xff, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
-; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
-; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v10, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
+; GFX7-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v12, v15, v16
-; GFX7-NEXT:    v_or_b32_e32 v2, v11, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX7-NEXT:    v_or_b32_e32 v3, v12, v3
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v7
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v8
-; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s[2:3]
-; GFX7-NEXT:    v_and_b32_e32 v5, s6, v5
-; GFX7-NEXT:    v_or_b32_e32 v5, s5, v5
+; GFX7-NEXT:    v_or_b32_e32 v11, v14, v15
+; GFX7-NEXT:    v_or_b32_e32 v2, v10, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s[2:3]
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v4
+; GFX7-NEXT:    v_or_b32_e32 v4, s5, v4
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v9, v0, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v11, v1, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v12, v1, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v14, v2, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
-; GFX7-NEXT:    v_and_b32_e32 v11, v1, v4
-; GFX7-NEXT:    v_and_b32_e32 v13, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v1
+; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v13, v2, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xff, v2
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
-; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
-; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v11, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX7-NEXT:    v_or_b32_e32 v2, v10, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
 ; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
-; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v4, v3, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX7-NEXT:    v_or_b32_e32 v1, v10, v1
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX7-NEXT:    s_endpgm
@@ -4757,7 +4729,6 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s5, s0, 24
@@ -4811,58 +4782,58 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xff, s4
 ; GFX7-NEXT:    s_andn2_b32 s4, s6, s4
-; GFX7-NEXT:    v_or_b32_e32 v5, s4, v0
+; GFX7-NEXT:    v_or_b32_e32 v4, s4, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
-; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v9, v0, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_bfe_u32 v9, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
-; GFX7-NEXT:    v_and_b32_e32 v5, v1, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v9
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX7-NEXT:    v_bfe_u32 v6, v2, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT:    v_and_b32_e32 v5, v2, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
+; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
 ; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v3, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -5256,56 +5227,55 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_or_b32_e32 v6, v0, v2
+; GFX7-NEXT:    v_or_b32_e32 v5, v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0xff
-; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v9, v0, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s11
-; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_bfe_u32 v9, v1, 8, 8
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, v1, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v9
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
-; GFX7-NEXT:    v_bfe_u32 v6, v2, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
+; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, v2, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
 ; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -5638,26 +5608,26 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
-; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX7-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s4, s0, 24
-; GFX7-NEXT:    s_and_b32 s9, s0, 0xff
-; GFX7-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX7-NEXT:    s_and_b32 s8, s0, 0xff
+; GFX7-NEXT:    s_lshl_b32 s9, s9, 8
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
-; GFX7-NEXT:    s_or_b32 s9, s9, s10
+; GFX7-NEXT:    s_or_b32 s8, s8, s9
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX7-NEXT:    s_or_b32 s0, s9, s0
+; GFX7-NEXT:    s_or_b32 s0, s8, s0
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 24
-; GFX7-NEXT:    s_bfe_u32 s9, s1, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s8, s1, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s5, s1, 24
 ; GFX7-NEXT:    s_or_b32 s4, s0, s4
 ; GFX7-NEXT:    s_and_b32 s0, s1, 0xff
-; GFX7-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX7-NEXT:    s_lshl_b32 s8, s8, 8
 ; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80010
-; GFX7-NEXT:    s_or_b32 s0, s0, s9
+; GFX7-NEXT:    s_or_b32 s0, s0, s8
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_lshl_b32 s1, s5, 24
@@ -5687,66 +5657,64 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s8, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
-; GFX7-NEXT:    v_or_b32_e32 v6, v1, v0
+; GFX7-NEXT:    v_or_b32_e32 v5, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0xff
-; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v9, v0, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_bfe_u32 v9, v1, 8, 8
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, v1, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v9
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
-; GFX7-NEXT:    v_bfe_u32 v6, v2, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
+; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, v2, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
 ; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -6038,109 +6006,107 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NEXT:    v_mov_b32_e32 v7, 0xff
-; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 2, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 2, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v18
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v18
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
+; GFX7-NEXT:    v_lshl_b32_e32 v18, s0, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v17
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xff, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v17
+; GFX7-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v11, v3, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v13, v4, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v10, v3, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v12, v4, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v4
-; GFX7-NEXT:    v_and_b32_e32 v10, s0, v3
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v12, s0, v4
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xff, v4
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v15, v5, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v5
-; GFX7-NEXT:    v_and_b32_e32 v14, v5, v7
+; GFX7-NEXT:    v_bfe_u32 v14, v5, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v5
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v5
 ; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v17, v6, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v16, v6, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
-; GFX7-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
-; GFX7-NEXT:    v_and_b32_e32 v16, v6, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v6
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xff, v6
 ; GFX7-NEXT:    v_bfe_u32 v6, v6, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
-; GFX7-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v11, v4
-; GFX7-NEXT:    s_and_b32 s0, s2, 0xff
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
+; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
+; GFX7-NEXT:    v_or_b32_e32 v3, v9, v3
+; GFX7-NEXT:    v_or_b32_e32 v4, v10, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX7-NEXT:    v_or_b32_e32 v5, v12, v5
+; GFX7-NEXT:    v_or_b32_e32 v12, v15, v16
+; GFX7-NEXT:    v_or_b32_e32 v5, v11, v5
 ; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX7-NEXT:    v_lshl_b32_e32 v19, s0, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX7-NEXT:    v_or_b32_e32 v6, v13, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v5, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v5, v7
 ; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v18
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v2, v7
-; GFX7-NEXT:    v_or_b32_e32 v4, v6, v9
+; GFX7-NEXT:    v_or_b32_e32 v4, v6, v8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s[0:1]
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v18
-; GFX7-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[2:3]
 ; GFX7-NEXT:    v_and_b32_e32 v2, v5, v2
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v19
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v18
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[0:1]
+; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v11, v1, 8, 8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[2:3]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v9, v0, v7
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v12, v1, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v11, v1, v7
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v14, v3, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_bfe_u32 v13, v3, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
-; GFX7-NEXT:    v_and_b32_e32 v13, v3, v7
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xff, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
-; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
-; GFX7-NEXT:    v_or_b32_e32 v1, v10, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v11, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v6
 ; GFX7-NEXT:    v_bfe_u32 v5, v4, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, v4, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX7-NEXT:    s_endpgm
@@ -6407,109 +6373,107 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NEXT:    v_mov_b32_e32 v7, 0xff
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v2
+; GFX7-NEXT:    s_and_b32 s0, s2, 3
 ; GFX7-NEXT:    s_lshr_b32 s4, s2, 2
-; GFX7-NEXT:    v_and_b32_e32 v2, v2, v7
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
+; GFX7-NEXT:    s_lshl_b32 s0, 0xff, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX7-NEXT:    s_not_b32 s5, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v11, v3, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v13, v4, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v4
-; GFX7-NEXT:    v_and_b32_e32 v10, s0, v3
+; GFX7-NEXT:    v_bfe_u32 v10, v3, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v12, v4, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v4
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v12, s0, v4
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xff, v4
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v15, v5, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v5
-; GFX7-NEXT:    v_and_b32_e32 v14, v5, v7
+; GFX7-NEXT:    v_bfe_u32 v14, v5, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v5
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v5
 ; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v17, v6, 8, 8
-; GFX7-NEXT:    s_and_b32 s0, s2, 3
+; GFX7-NEXT:    v_bfe_u32 v16, v6, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
-; GFX7-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
-; GFX7-NEXT:    v_and_b32_e32 v16, v6, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v6
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xff, v6
 ; GFX7-NEXT:    v_bfe_u32 v6, v6, 16, 8
-; GFX7-NEXT:    s_lshl_b32 s0, s0, 3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
-; GFX7-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v11, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s0, v2
-; GFX7-NEXT:    s_lshl_b32 s0, 0xff, s0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
+; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
+; GFX7-NEXT:    v_or_b32_e32 v3, v9, v3
+; GFX7-NEXT:    v_or_b32_e32 v4, v10, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX7-NEXT:    v_or_b32_e32 v5, v12, v5
-; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX7-NEXT:    s_not_b32 s5, s0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX7-NEXT:    v_or_b32_e32 v6, v13, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
-; GFX7-NEXT:    v_or_b32_e32 v4, v6, v9
+; GFX7-NEXT:    v_or_b32_e32 v12, v15, v16
+; GFX7-NEXT:    v_or_b32_e32 v5, v11, v5
+; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v5, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v1, v2, vcc
+; GFX7-NEXT:    v_or_b32_e32 v4, v6, v8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s[0:1]
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[2:3]
 ; GFX7-NEXT:    v_and_b32_e32 v5, s5, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[2:3]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v9, v0, v7
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v12, v1, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v11, v1, v7
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s[0:1]
+; GFX7-NEXT:    v_bfe_u32 v9, v1, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v11, v2, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v0, s[2:3]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v14, v3, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v13, v3, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
-; GFX7-NEXT:    v_and_b32_e32 v13, v3, v7
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xff, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
-; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
-; GFX7-NEXT:    v_or_b32_e32 v1, v10, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v11, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX7-NEXT:    v_or_b32_e32 v1, v8, v1
+; GFX7-NEXT:    v_or_b32_e32 v2, v9, v2
+; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v2, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v6
 ; GFX7-NEXT:    v_bfe_u32 v5, v4, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, v4, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX7-NEXT:    s_endpgm
@@ -6775,103 +6739,101 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NEXT:    v_mov_b32_e32 v8, 0xff
-; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 2, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX7-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v18
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v18
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v3, v8
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v19
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xff, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v18
 ; GFX7-NEXT:    v_xor_b32_e32 v3, -1, v3
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v18
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v12, v4, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v14, v5, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v11, v4, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v13, v5, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v5
-; GFX7-NEXT:    v_and_b32_e32 v11, s0, v4
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v4
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v13, s0, v5
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xff, v5
 ; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v16, v6, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
-; GFX7-NEXT:    v_and_b32_e32 v15, v6, v8
+; GFX7-NEXT:    v_bfe_u32 v15, v6, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v6
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xff, v6
 ; GFX7-NEXT:    v_bfe_u32 v6, v6, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v18, v7, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v17, v7, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
-; GFX7-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX7-NEXT:    v_or_b32_e32 v12, v13, v14
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 24, v7
-; GFX7-NEXT:    v_and_b32_e32 v17, v7, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
+; GFX7-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX7-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v7
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xff, v7
 ; GFX7-NEXT:    v_bfe_u32 v7, v7, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
-; GFX7-NEXT:    v_or_b32_e32 v13, v15, v16
-; GFX7-NEXT:    v_or_b32_e32 v4, v11, v4
-; GFX7-NEXT:    v_or_b32_e32 v5, v12, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
+; GFX7-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX7-NEXT:    v_or_b32_e32 v4, v10, v4
+; GFX7-NEXT:    v_or_b32_e32 v5, v11, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT:    v_or_b32_e32 v14, v17, v18
-; GFX7-NEXT:    v_or_b32_e32 v6, v13, v6
+; GFX7-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
 ; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX7-NEXT:    v_or_b32_e32 v7, v14, v7
-; GFX7-NEXT:    v_or_b32_e32 v4, v6, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX7-NEXT:    v_or_b32_e32 v7, v13, v7
+; GFX7-NEXT:    v_or_b32_e32 v4, v6, v8
 ; GFX7-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v19
-; GFX7-NEXT:    v_or_b32_e32 v5, v7, v10
+; GFX7-NEXT:    v_or_b32_e32 v5, v7, v9
 ; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v4, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v5, s[2:3]
 ; GFX7-NEXT:    v_and_b32_e32 v3, v6, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v4, v2, s[0:1]
+; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v11, v1, 8, 8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v5, v2, s[2:3]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v9, v0, v8
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v12, v1, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v11, v1, v8
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v14, v3, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_bfe_u32 v13, v3, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
-; GFX7-NEXT:    v_and_b32_e32 v13, v3, v8
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xff, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
-; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
-; GFX7-NEXT:    v_or_b32_e32 v1, v10, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v11, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v6
 ; GFX7-NEXT:    v_bfe_u32 v5, v4, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, v4, v8
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
index 3201adf3bd8d4..73aeb2296e683 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
@@ -508,10 +508,9 @@ define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX8-UNPACKED-NEXT:    s_mov_b32 s6, s8
 ; GFX8-UNPACKED-NEXT:    s_mov_b32 s7, s9
 ; GFX8-UNPACKED-NEXT:    image_load v[0:2], v0, s[0:7] dmask:0x7 unorm d16
-; GFX8-UNPACKED-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-UNPACKED-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v3, s0, v1
-; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v1, s0, v2
+; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX8-UNPACKED-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX8-UNPACKED-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-UNPACKED-NEXT:    ; return to shader part epilog
@@ -589,10 +588,9 @@ define amdgpu_ps <4 x half> @load_1d_v4f16_xyzw(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX8-UNPACKED-NEXT:    s_mov_b32 s6, s8
 ; GFX8-UNPACKED-NEXT:    s_mov_b32 s7, s9
 ; GFX8-UNPACKED-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm d16
-; GFX8-UNPACKED-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-UNPACKED-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v3, s0, v3
+; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX8-UNPACKED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-UNPACKED-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-UNPACKED-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index 6a6b12108620f..c8e1881a21e94 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -46,14 +46,14 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX906-NEXT:    s_movk_i32 s4, 0xff
 ; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX906-NEXT:    v_and_or_b32 v0, v0, s4, v1
-; GFX906-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX906-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, s5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_b32_e32 v2, s4, v6
-; GFX906-NEXT:    v_and_b32_e32 v3, s4, v7
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v6
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v7
 ; GFX906-NEXT:    v_and_or_b32 v1, v4, s4, v1
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index 6b28f32ba0739..f397be139a8d1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -46,14 +46,14 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX906-NEXT:    s_movk_i32 s4, 0xff
 ; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX906-NEXT:    v_and_or_b32 v0, v0, s4, v1
-; GFX906-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX906-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, s5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_b32_e32 v2, s4, v6
-; GFX906-NEXT:    v_and_b32_e32 v3, s4, v7
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v6
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v7
 ; GFX906-NEXT:    v_and_or_b32 v1, v4, s4, v1
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 107a95da9d169..3905bbb0be4a6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -8,9 +8,8 @@ define i8 @v_lshr_i8(i8 %value, i8 %amount) {
 ; GFX6-LABEL: v_lshr_i8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -105,9 +104,8 @@ define i24 @v_lshr_i24(i24 %value, i24 %amount) {
 ; GCN-LABEL: v_lshr_i24:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, 0xffffff
-; GCN-NEXT:    v_and_b32_e32 v1, s4, v1
-; GCN-NEXT:    v_and_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -554,9 +552,8 @@ define i16 @v_lshr_i16(i16 %value, i16 %amount) {
 ; GFX6-LABEL: v_lshr_i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -685,12 +682,11 @@ define <2 x i16> @v_lshr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 ; GFX6-LABEL: v_lshr_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -801,11 +797,10 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
 ; GFX6-LABEL: lshr_v2i16_sv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s2, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -838,12 +833,11 @@ define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
 define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
 ; GFX6-LABEL: lshr_v2i16_vs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -887,18 +881,17 @@ define <2 x float> @v_lshr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 ; GFX6-LABEL: v_lshr_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v4, v0
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v5
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v6
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1036,31 +1029,29 @@ define <4 x float> @v_lshr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-LABEL: v_lshr_v8i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v8
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v8, v0
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v9
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v9
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v8, v1
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v10
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v10
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v8, v2
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v11
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v11
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v8, v3
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v12
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v12
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v8, v4
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v13
-; GFX6-NEXT:    v_and_b32_e32 v5, s4, v5
-; GFX6-NEXT:    v_mov_b32_e32 v16, 0xffff
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v13
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, v8, v5
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v14
-; GFX6-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v14
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, v8, v6
-; GFX6-NEXT:    v_and_b32_e32 v8, v15, v16
-; GFX6-NEXT:    v_and_b32_e32 v7, v7, v16
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v15
+; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v7, v8, v7
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index e5586787ac5b3..46068ac691447 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -38,9 +38,8 @@ define i16 @v_mul_i16(i16 %num, i16 %den) {
 ; GFX7-LABEL: v_mul_i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -169,9 +168,8 @@ define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
 ; GFX7-LABEL: v_mul_i16_signext:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll
index 7eaa50b7d20d1..09aded6bc0f7c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll
@@ -94,9 +94,8 @@ define <2 x i16> @v_mul_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
 ; GFX8-LABEL: v_mul_v2i16_fneg_lhs_fneg_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s4, 0x80008000
-; GFX8-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index e325c8b9af021..3ae07cc225647 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -559,12 +559,11 @@ define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
 ; GFX6-LABEL: v_orn2_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -816,18 +815,17 @@ define <4 x i16> @v_orn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) {
 ; GFX6-LABEL: v_orn2_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v8
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, v4, v8
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v4
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX6-NEXT:    v_and_b32_e32 v4, v6, v8
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
index 0a2414d397cb9..21ad7cc108ede 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
@@ -608,19 +608,17 @@ define <2 x double> @v_roundeven_v2f64(<2 x double> %x) {
 ; GFX6-LABEL: v_roundeven_v2f64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s6, 1
-; GFX6-NEXT:    v_and_b32_e32 v5, s6, v1
-; GFX6-NEXT:    s_mov_b32 s7, 0x43300000
+; GFX6-NEXT:    v_and_b32_e32 v5, 0x80000000, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0
-; GFX6-NEXT:    v_or_b32_e32 v5, s7, v5
+; GFX6-NEXT:    v_or_b32_e32 v5, 0x43300000, v5
 ; GFX6-NEXT:    v_add_f64 v[6:7], v[0:1], v[4:5]
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x432fffff
 ; GFX6-NEXT:    v_add_f64 v[5:6], v[6:7], -v[4:5]
 ; GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX6-NEXT:    v_and_b32_e32 v5, s6, v3
-; GFX6-NEXT:    v_or_b32_e32 v5, s7, v5
+; GFX6-NEXT:    v_and_b32_e32 v5, 0x80000000, v3
+; GFX6-NEXT:    v_or_b32_e32 v5, 0x43300000, v5
 ; GFX6-NEXT:    v_add_f64 v[7:8], v[2:3], v[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX6-NEXT:    v_add_f64 v[4:5], v[7:8], -v[4:5]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 9bf8d106cf01d..d51228d25c721 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -259,10 +259,9 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -273,21 +272,19 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    s_movk_i32 s5, 0x8000
 ; GFX8-NEXT:    v_min_i16_e32 v5, 0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v4, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v5, s5, v5
-; GFX8-NEXT:    v_sub_u16_e32 v4, s4, v4
+; GFX8-NEXT:    v_sub_u16_e32 v5, 0x8000, v5
+; GFX8-NEXT:    v_sub_u16_e32 v4, 0x7fff, v4
 ; GFX8-NEXT:    v_max_i16_e32 v1, v5, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
 ; GFX8-NEXT:    v_min_i16_e32 v4, 0, v3
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_max_i16_e32 v1, 0, v3
-; GFX8-NEXT:    v_sub_u16_e32 v4, s5, v4
-; GFX8-NEXT:    v_sub_u16_e32 v1, s4, v1
+; GFX8-NEXT:    v_sub_u16_e32 v4, 0x8000, v4
+; GFX8-NEXT:    v_sub_u16_e32 v1, 0x7fff, v1
 ; GFX8-NEXT:    v_max_i16_e32 v2, v4, v2
 ; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_add_u16_e32 v1, v3, v1
@@ -512,22 +509,21 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
 ; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -540,33 +536,30 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    s_movk_i32 s5, 0x8000
-; GFX8-NEXT:    v_min_i16_e32 v10, 0, v0
+; GFX8-NEXT:    v_min_i16_e32 v9, 0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v8, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v10, s5, v10
-; GFX8-NEXT:    v_sub_u16_e32 v8, s4, v8
-; GFX8-NEXT:    v_max_i16_e32 v1, v10, v1
+; GFX8-NEXT:    v_sub_u16_e32 v9, 0x8000, v9
+; GFX8-NEXT:    v_sub_u16_e32 v8, 0x7fff, v8
+; GFX8-NEXT:    v_max_i16_e32 v1, v9, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v8
 ; GFX8-NEXT:    v_min_i16_e32 v8, 0, v3
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_max_i16_e32 v1, 0, v3
-; GFX8-NEXT:    v_sub_u16_e32 v8, s5, v8
-; GFX8-NEXT:    v_sub_u16_e32 v1, s4, v1
+; GFX8-NEXT:    v_sub_u16_e32 v8, 0x8000, v8
+; GFX8-NEXT:    v_sub_u16_e32 v1, 0x7fff, v1
 ; GFX8-NEXT:    v_max_i16_e32 v2, v8, v2
 ; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
 ; GFX8-NEXT:    v_add_u16_e32 v1, v3, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v6
 ; GFX8-NEXT:    v_min_i16_e32 v6, 0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v9, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v4, 0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v6, s5, v6
-; GFX8-NEXT:    v_sub_u16_e32 v4, v9, v4
+; GFX8-NEXT:    v_sub_u16_e32 v6, 0x8000, v6
+; GFX8-NEXT:    v_sub_u16_e32 v4, 0x7fff, v4
 ; GFX8-NEXT:    v_max_i16_e32 v3, v6, v3
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v4
 ; GFX8-NEXT:    v_add_u16_e32 v2, v2, v3
@@ -575,7 +568,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
 ; GFX8-NEXT:    v_max_i16_e32 v5, 0, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v6, 0x8000, v6
-; GFX8-NEXT:    v_sub_u16_e32 v5, v9, v5
+; GFX8-NEXT:    v_sub_u16_e32 v5, 0x7fff, v5
 ; GFX8-NEXT:    v_max_i16_e32 v4, v6, v4
 ; GFX8-NEXT:    v_min_i16_e32 v4, v4, v5
 ; GFX8-NEXT:    v_add_u16_e32 v3, v3, v4
@@ -619,7 +612,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -853,7 +846,7 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    s_mov_b32 s5, 24
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, s0, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
@@ -2699,19 +2692,17 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-LABEL: v_saddsat_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s5, 0x8000
 ; GFX8-NEXT:    v_min_i16_e32 v4, 0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v3, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v4, s5, v4
-; GFX8-NEXT:    v_sub_u16_e32 v3, s4, v3
+; GFX8-NEXT:    v_sub_u16_e32 v4, 0x8000, v4
+; GFX8-NEXT:    v_sub_u16_e32 v3, 0x7fff, v3
 ; GFX8-NEXT:    v_max_i16_e32 v4, v4, v1
 ; GFX8-NEXT:    v_min_i16_e32 v5, 0, v2
 ; GFX8-NEXT:    v_min_i16_e32 v3, v4, v3
 ; GFX8-NEXT:    v_max_i16_e32 v4, 0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v5, s5, v5
-; GFX8-NEXT:    v_sub_u16_e32 v4, s4, v4
+; GFX8-NEXT:    v_sub_u16_e32 v5, 0x8000, v5
+; GFX8-NEXT:    v_sub_u16_e32 v4, 0x7fff, v4
 ; GFX8-NEXT:    v_max_i16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v3
@@ -2838,10 +2829,9 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s0, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2909,30 +2899,27 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: saddsat_v2i16_vs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_movk_i32 s3, 0x8000
 ; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v3, s3, v3
-; GFX8-NEXT:    v_sub_u16_e32 v2, s2, v2
+; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
+; GFX8-NEXT:    v_sub_u16_e32 v2, 0x7fff, v2
 ; GFX8-NEXT:    v_max_i16_e32 v3, s0, v3
 ; GFX8-NEXT:    v_min_i16_e32 v4, 0, v1
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    v_min_i16_e32 v2, v3, v2
 ; GFX8-NEXT:    v_max_i16_e32 v3, 0, v1
-; GFX8-NEXT:    v_sub_u16_e32 v4, s3, v4
-; GFX8-NEXT:    v_sub_u16_e32 v3, s2, v3
+; GFX8-NEXT:    v_sub_u16_e32 v4, 0x8000, v4
+; GFX8-NEXT:    v_sub_u16_e32 v3, 0x7fff, v3
 ; GFX8-NEXT:    v_max_i16_e32 v4, s1, v4
 ; GFX8-NEXT:    v_min_i16_e32 v3, v4, v3
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v2
@@ -3009,17 +2996,16 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -3027,32 +3013,30 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-LABEL: v_saddsat_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s5, 0x8000
 ; GFX8-NEXT:    v_min_i16_e32 v7, 0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v6, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v7, s5, v7
-; GFX8-NEXT:    v_sub_u16_e32 v6, s4, v6
+; GFX8-NEXT:    v_sub_u16_e32 v7, 0x8000, v7
+; GFX8-NEXT:    v_sub_u16_e32 v6, 0x7fff, v6
 ; GFX8-NEXT:    v_max_i16_e32 v7, v7, v2
 ; GFX8-NEXT:    v_min_i16_e32 v8, 0, v4
 ; GFX8-NEXT:    v_min_i16_e32 v6, v7, v6
 ; GFX8-NEXT:    v_max_i16_e32 v7, 0, v4
-; GFX8-NEXT:    v_sub_u16_e32 v8, s5, v8
-; GFX8-NEXT:    v_sub_u16_e32 v7, s4, v7
+; GFX8-NEXT:    v_sub_u16_e32 v8, 0x8000, v8
+; GFX8-NEXT:    v_sub_u16_e32 v7, 0x7fff, v7
 ; GFX8-NEXT:    v_max_i16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v8, 0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX8-NEXT:    v_min_i16_e32 v2, v2, v7
 ; GFX8-NEXT:    v_max_i16_e32 v7, 0, v1
-; GFX8-NEXT:    v_sub_u16_e32 v8, s5, v8
-; GFX8-NEXT:    v_sub_u16_e32 v7, s4, v7
+; GFX8-NEXT:    v_sub_u16_e32 v8, 0x8000, v8
+; GFX8-NEXT:    v_sub_u16_e32 v7, 0x7fff, v7
 ; GFX8-NEXT:    v_max_i16_e32 v8, v8, v3
 ; GFX8-NEXT:    v_min_i16_e32 v9, 0, v5
 ; GFX8-NEXT:    v_min_i16_e32 v7, v8, v7
 ; GFX8-NEXT:    v_max_i16_e32 v8, 0, v5
-; GFX8-NEXT:    v_sub_u16_e32 v9, s5, v9
-; GFX8-NEXT:    v_sub_u16_e32 v8, s4, v8
+; GFX8-NEXT:    v_sub_u16_e32 v9, 0x8000, v9
+; GFX8-NEXT:    v_sub_u16_e32 v8, 0x7fff, v8
 ; GFX8-NEXT:    v_max_i16_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v8
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v6
@@ -3294,24 +3278,23 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
 ; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -3319,56 +3302,52 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-LABEL: v_saddsat_v6i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s5, 0x8000
-; GFX8-NEXT:    v_min_i16_e32 v11, 0, v0
+; GFX8-NEXT:    v_min_i16_e32 v10, 0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v9, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v11, s5, v11
-; GFX8-NEXT:    v_sub_u16_e32 v9, s4, v9
-; GFX8-NEXT:    v_max_i16_e32 v11, v11, v3
-; GFX8-NEXT:    v_min_i16_e32 v13, 0, v6
-; GFX8-NEXT:    v_min_i16_e32 v9, v11, v9
-; GFX8-NEXT:    v_max_i16_e32 v11, 0, v6
-; GFX8-NEXT:    v_sub_u16_e32 v13, s5, v13
-; GFX8-NEXT:    v_sub_u16_e32 v11, s4, v11
-; GFX8-NEXT:    v_max_i16_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v13, 0, v1
+; GFX8-NEXT:    v_sub_u16_e32 v10, 0x8000, v10
+; GFX8-NEXT:    v_sub_u16_e32 v9, 0x7fff, v9
+; GFX8-NEXT:    v_max_i16_e32 v10, v10, v3
+; GFX8-NEXT:    v_min_i16_e32 v11, 0, v6
+; GFX8-NEXT:    v_min_i16_e32 v9, v10, v9
+; GFX8-NEXT:    v_max_i16_e32 v10, 0, v6
+; GFX8-NEXT:    v_sub_u16_e32 v11, 0x8000, v11
+; GFX8-NEXT:    v_sub_u16_e32 v10, 0x7fff, v10
+; GFX8-NEXT:    v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v11, 0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT:    v_min_i16_e32 v3, v3, v11
-; GFX8-NEXT:    v_max_i16_e32 v11, 0, v1
-; GFX8-NEXT:    v_sub_u16_e32 v13, s5, v13
-; GFX8-NEXT:    v_sub_u16_e32 v11, s4, v11
-; GFX8-NEXT:    v_max_i16_e32 v13, v13, v4
-; GFX8-NEXT:    v_min_i16_e32 v14, 0, v7
-; GFX8-NEXT:    v_min_i16_e32 v11, v13, v11
-; GFX8-NEXT:    v_max_i16_e32 v13, 0, v7
-; GFX8-NEXT:    v_sub_u16_e32 v14, s5, v14
-; GFX8-NEXT:    v_mov_b32_e32 v12, 0xffff8000
-; GFX8-NEXT:    v_sub_u16_e32 v13, s4, v13
-; GFX8-NEXT:    v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v14, 0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v10, 0x7fff
-; GFX8-NEXT:    v_min_i16_e32 v4, v4, v13
-; GFX8-NEXT:    v_max_i16_e32 v13, 0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v14, v12, v14
+; GFX8-NEXT:    v_min_i16_e32 v3, v3, v10
+; GFX8-NEXT:    v_max_i16_e32 v10, 0, v1
+; GFX8-NEXT:    v_sub_u16_e32 v11, 0x8000, v11
+; GFX8-NEXT:    v_sub_u16_e32 v10, 0x7fff, v10
+; GFX8-NEXT:    v_max_i16_e32 v11, v11, v4
+; GFX8-NEXT:    v_min_i16_e32 v12, 0, v7
+; GFX8-NEXT:    v_min_i16_e32 v10, v11, v10
+; GFX8-NEXT:    v_max_i16_e32 v11, 0, v7
+; GFX8-NEXT:    v_sub_u16_e32 v12, 0x8000, v12
+; GFX8-NEXT:    v_sub_u16_e32 v11, 0x7fff, v11
+; GFX8-NEXT:    v_max_i16_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v12, 0, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX8-NEXT:    v_sub_u16_e32 v13, v10, v13
-; GFX8-NEXT:    v_max_i16_e32 v14, v14, v5
-; GFX8-NEXT:    v_min_i16_e32 v13, v14, v13
-; GFX8-NEXT:    v_max_i16_e32 v14, 0, v8
-; GFX8-NEXT:    v_sub_u16_e32 v10, v10, v14
-; GFX8-NEXT:    v_min_i16_e32 v14, 0, v8
-; GFX8-NEXT:    v_sub_u16_e32 v12, v12, v14
-; GFX8-NEXT:    v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v4, v4, v11
+; GFX8-NEXT:    v_max_i16_e32 v11, 0, v2
+; GFX8-NEXT:    v_sub_u16_e32 v12, 0x8000, v12
+; GFX8-NEXT:    v_sub_u16_e32 v11, 0x7fff, v11
+; GFX8-NEXT:    v_max_i16_e32 v12, v12, v5
+; GFX8-NEXT:    v_min_i16_e32 v13, 0, v8
+; GFX8-NEXT:    v_min_i16_e32 v11, v12, v11
+; GFX8-NEXT:    v_max_i16_e32 v12, 0, v8
+; GFX8-NEXT:    v_sub_u16_e32 v13, 0x8000, v13
+; GFX8-NEXT:    v_sub_u16_e32 v12, 0x7fff, v12
+; GFX8-NEXT:    v_max_i16_sdwa v5, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v9
 ; GFX8-NEXT:    v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_i16_e32 v5, v5, v10
+; GFX8-NEXT:    v_min_i16_e32 v5, v5, v12
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_add_u16_e32 v1, v1, v11
+; GFX8-NEXT:    v_add_u16_e32 v1, v1, v10
 ; GFX8-NEXT:    v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_add_u16_e32 v2, v2, v13
+; GFX8-NEXT:    v_add_u16_e32 v2, v2, v11
 ; GFX8-NEXT:    v_add_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -3670,32 +3649,31 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
 ; GFX6-NEXT:    v_max_i32_e32 v9, 0, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
 ; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 16, v6
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v6
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -3703,72 +3681,68 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-LABEL: v_saddsat_v8i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s5, 0x8000
-; GFX8-NEXT:    v_min_i16_e32 v14, 0, v0
+; GFX8-NEXT:    v_min_i16_e32 v13, 0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v12, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v14, s5, v14
-; GFX8-NEXT:    v_sub_u16_e32 v12, s4, v12
-; GFX8-NEXT:    v_max_i16_e32 v14, v14, v4
-; GFX8-NEXT:    v_min_i16_e32 v16, 0, v8
-; GFX8-NEXT:    v_min_i16_e32 v12, v14, v12
-; GFX8-NEXT:    v_max_i16_e32 v14, 0, v8
-; GFX8-NEXT:    v_sub_u16_e32 v16, s5, v16
-; GFX8-NEXT:    v_sub_u16_e32 v14, s4, v14
-; GFX8-NEXT:    v_max_i16_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v16, 0, v1
+; GFX8-NEXT:    v_sub_u16_e32 v13, 0x8000, v13
+; GFX8-NEXT:    v_sub_u16_e32 v12, 0x7fff, v12
+; GFX8-NEXT:    v_max_i16_e32 v13, v13, v4
+; GFX8-NEXT:    v_min_i16_e32 v14, 0, v8
+; GFX8-NEXT:    v_min_i16_e32 v12, v13, v12
+; GFX8-NEXT:    v_max_i16_e32 v13, 0, v8
+; GFX8-NEXT:    v_sub_u16_e32 v14, 0x8000, v14
+; GFX8-NEXT:    v_sub_u16_e32 v13, 0x7fff, v13
+; GFX8-NEXT:    v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v14, 0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX8-NEXT:    v_min_i16_e32 v4, v4, v14
-; GFX8-NEXT:    v_max_i16_e32 v14, 0, v1
-; GFX8-NEXT:    v_sub_u16_e32 v16, s5, v16
-; GFX8-NEXT:    v_sub_u16_e32 v14, s4, v14
-; GFX8-NEXT:    v_max_i16_e32 v16, v16, v5
-; GFX8-NEXT:    v_min_i16_e32 v17, 0, v9
-; GFX8-NEXT:    v_min_i16_e32 v14, v16, v14
-; GFX8-NEXT:    v_max_i16_e32 v16, 0, v9
-; GFX8-NEXT:    v_sub_u16_e32 v17, s5, v17
-; GFX8-NEXT:    v_mov_b32_e32 v15, 0xffff8000
-; GFX8-NEXT:    v_sub_u16_e32 v16, s4, v16
-; GFX8-NEXT:    v_max_i16_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v17, 0, v2
+; GFX8-NEXT:    v_min_i16_e32 v4, v4, v13
+; GFX8-NEXT:    v_max_i16_e32 v13, 0, v1
+; GFX8-NEXT:    v_sub_u16_e32 v14, 0x8000, v14
+; GFX8-NEXT:    v_sub_u16_e32 v13, 0x7fff, v13
+; GFX8-NEXT:    v_max_i16_e32 v14, v14, v5
+; GFX8-NEXT:    v_min_i16_e32 v15, 0, v9
+; GFX8-NEXT:    v_min_i16_e32 v13, v14, v13
+; GFX8-NEXT:    v_max_i16_e32 v14, 0, v9
+; GFX8-NEXT:    v_sub_u16_e32 v15, 0x8000, v15
+; GFX8-NEXT:    v_sub_u16_e32 v14, 0x7fff, v14
+; GFX8-NEXT:    v_max_i16_sdwa v5, v15, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v15, 0, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX8-NEXT:    v_mov_b32_e32 v13, 0x7fff
-; GFX8-NEXT:    v_min_i16_e32 v5, v5, v16
-; GFX8-NEXT:    v_max_i16_e32 v16, 0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v17, v15, v17
-; GFX8-NEXT:    v_sub_u16_e32 v16, v13, v16
-; GFX8-NEXT:    v_max_i16_e32 v17, v17, v6
-; GFX8-NEXT:    v_min_i16_e32 v18, 0, v10
-; GFX8-NEXT:    v_min_i16_e32 v16, v17, v16
-; GFX8-NEXT:    v_max_i16_e32 v17, 0, v10
-; GFX8-NEXT:    v_sub_u16_e32 v18, v15, v18
-; GFX8-NEXT:    v_sub_u16_e32 v17, v13, v17
-; GFX8-NEXT:    v_max_i16_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v18, 0, v3
-; GFX8-NEXT:    v_min_i16_e32 v6, v6, v17
-; GFX8-NEXT:    v_max_i16_e32 v17, 0, v3
-; GFX8-NEXT:    v_sub_u16_e32 v18, v15, v18
+; GFX8-NEXT:    v_min_i16_e32 v5, v5, v14
+; GFX8-NEXT:    v_max_i16_e32 v14, 0, v2
+; GFX8-NEXT:    v_sub_u16_e32 v15, 0x8000, v15
+; GFX8-NEXT:    v_sub_u16_e32 v14, 0x7fff, v14
+; GFX8-NEXT:    v_max_i16_e32 v15, v15, v6
+; GFX8-NEXT:    v_min_i16_e32 v16, 0, v10
+; GFX8-NEXT:    v_min_i16_e32 v14, v15, v14
+; GFX8-NEXT:    v_max_i16_e32 v15, 0, v10
+; GFX8-NEXT:    v_sub_u16_e32 v16, 0x8000, v16
+; GFX8-NEXT:    v_sub_u16_e32 v15, 0x7fff, v15
+; GFX8-NEXT:    v_max_i16_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v16, 0, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX8-NEXT:    v_sub_u16_e32 v17, v13, v17
-; GFX8-NEXT:    v_max_i16_e32 v18, v18, v7
-; GFX8-NEXT:    v_min_i16_e32 v17, v18, v17
-; GFX8-NEXT:    v_max_i16_e32 v18, 0, v11
-; GFX8-NEXT:    v_sub_u16_e32 v13, v13, v18
-; GFX8-NEXT:    v_min_i16_e32 v18, 0, v11
-; GFX8-NEXT:    v_sub_u16_e32 v15, v15, v18
+; GFX8-NEXT:    v_min_i16_e32 v6, v6, v15
+; GFX8-NEXT:    v_max_i16_e32 v15, 0, v3
+; GFX8-NEXT:    v_sub_u16_e32 v16, 0x8000, v16
+; GFX8-NEXT:    v_sub_u16_e32 v15, 0x7fff, v15
+; GFX8-NEXT:    v_max_i16_e32 v16, v16, v7
+; GFX8-NEXT:    v_min_i16_e32 v17, 0, v11
+; GFX8-NEXT:    v_min_i16_e32 v15, v16, v15
+; GFX8-NEXT:    v_max_i16_e32 v16, 0, v11
+; GFX8-NEXT:    v_sub_u16_e32 v17, 0x8000, v17
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v12
 ; GFX8-NEXT:    v_add_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_i16_sdwa v7, v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_sub_u16_e32 v16, 0x7fff, v16
+; GFX8-NEXT:    v_max_i16_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_add_u16_e32 v1, v1, v14
+; GFX8-NEXT:    v_add_u16_e32 v1, v1, v13
 ; GFX8-NEXT:    v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_i16_e32 v7, v7, v13
+; GFX8-NEXT:    v_min_i16_e32 v7, v7, v16
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_add_u16_e32 v2, v2, v16
+; GFX8-NEXT:    v_add_u16_e32 v2, v2, v14
 ; GFX8-NEXT:    v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT:    v_add_u16_e32 v3, v3, v17
+; GFX8-NEXT:    v_add_u16_e32 v3, v3, v15
 ; GFX8-NEXT:    v_add_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index 2599327910eb5..78a0048443dc3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -615,20 +615,19 @@ define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; GISEL-LABEL: v_sdiv_v2i32_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_movk_i32 s4, 0x1000
+; GISEL-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
+; GISEL-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; GISEL-NEXT:    v_lshl_b32_e32 v2, s4, v2
-; GISEL-NEXT:    v_lshl_b32_e32 v3, s4, v3
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
 ; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
@@ -679,73 +678,72 @@ define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP-LABEL: v_sdiv_v2i32_pow2_shl_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_movk_i32 s4, 0x1000
+; CGP-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
+; CGP-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; CGP-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; CGP-NEXT:    v_lshl_b32_e32 v2, s4, v2
-; CGP-NEXT:    v_lshl_b32_e32 v3, s4, v3
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; CGP-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    v_xor_b32_e32 v4, v4, v6
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT:    v_xor_b32_e32 v8, v4, v6
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v7
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT:    v_xor_b32_e32 v4, v5, v7
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; CGP-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v7
-; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v2
-; CGP-NEXT:    v_sub_i32_e32 v7, vcc, 0, v2
-; CGP-NEXT:    v_cvt_f32_u32_e32 v8, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v2
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
+; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v3
 ; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
-; CGP-NEXT:    v_rcp_f32_e32 v6, v6
-; CGP-NEXT:    v_rcp_f32_e32 v8, v8
-; CGP-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; CGP-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
-; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT:    v_mul_lo_u32 v9, v9, v8
-; CGP-NEXT:    v_mul_lo_u32 v10, 0, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
+; CGP-NEXT:    v_rcp_f32_e32 v5, v5
+; CGP-NEXT:    v_rcp_f32_e32 v7, v7
+; CGP-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; CGP-NEXT:    v_mul_lo_u32 v6, v6, v5
+; CGP-NEXT:    v_mul_lo_u32 v9, v9, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, 0, v6
+; CGP-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; CGP-NEXT:    v_mul_lo_u32 v11, 0, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; CGP-NEXT:    v_mul_hi_u32 v9, v7, v9
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v9
-; CGP-NEXT:    v_mul_lo_u32 v8, 0, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v0, v6
-; CGP-NEXT:    v_mul_lo_u32 v9, 0, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; CGP-NEXT:    v_mul_lo_u32 v8, v6, v2
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, 1, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, v7, v3
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v7
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v9
+; CGP-NEXT:    v_mul_lo_u32 v7, 0, v5
+; CGP-NEXT:    v_mul_hi_u32 v5, v0, v5
+; CGP-NEXT:    v_mul_lo_u32 v9, 0, v6
+; CGP-NEXT:    v_mul_hi_u32 v6, v1, v6
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT:    v_mul_lo_u32 v7, v5, v2
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
+; CGP-NEXT:    v_mul_lo_u32 v10, v6, v3
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v6
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v10
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
-; CGP-NEXT:    v_sub_i32_e64 v8, s[4:5], v0, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; CGP-NEXT:    v_sub_i32_e64 v7, s[4:5], v0, v2
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s[4:5]
 ; CGP-NEXT:    v_sub_i32_e64 v9, s[6:7], v1, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v6
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, 1, v7
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, 1, v6
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v6, v9, vcc
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v8
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = sdiv <2 x i32> %x, %shl.y
@@ -756,9 +754,8 @@ define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) {
 ; GISEL-LABEL: v_sdiv_i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
-; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
-; GISEL-NEXT:    v_and_b32_e32 v1, s4, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -792,9 +789,8 @@ define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) {
 ; CGP-LABEL: v_sdiv_i32_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
-; CGP-NEXT:    v_and_b32_e32 v1, s4, v1
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; CGP-NEXT:    v_rcp_f32_e32 v2, v2
@@ -829,11 +825,10 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-LABEL: v_sdiv_v2i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
-; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
-; GISEL-NEXT:    v_and_b32_e32 v1, s4, v1
-; GISEL-NEXT:    v_and_b32_e32 v2, s4, v2
-; GISEL-NEXT:    v_and_b32_e32 v3, s4, v3
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
@@ -895,11 +890,10 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-LABEL: v_sdiv_v2i32_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
-; CGP-NEXT:    v_and_b32_e32 v1, s4, v1
-; CGP-NEXT:    v_and_b32_e32 v2, s4, v2
-; CGP-NEXT:    v_and_b32_e32 v3, s4, v3
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; CGP-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; CGP-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index f92b02bd7b6e0..81fd69bed5d4f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -1042,34 +1042,33 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-LABEL: v_sdiv_i64_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_movk_i32 s4, 0x1000
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT:    s_movk_i32 s5, 0xf000
-; CHECK-NEXT:    s_bfe_i32 s6, -1, 0x10000
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x1000
+; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
+; CHECK-NEXT:    s_movk_i32 s4, 0xf000
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v4
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v6, s5, v4
-; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v8, s5, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v2
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
+; CHECK-NEXT:    s_bfe_i32 s6, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
@@ -1090,9 +1089,10 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, s5, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s5, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
+; CHECK-NEXT:    s_movk_i32 s4, 0x1000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
@@ -1469,34 +1469,34 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_movk_i32 s6, 0x1000
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT:    s_movk_i32 s7, 0xf000
-; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x1000
+; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; CGP-NEXT:    s_movk_i32 s6, 0xf000
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; CGP-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v6, v6
 ; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; CGP-NEXT:    v_mul_lo_u32 v7, -1, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, s7, v6
-; CGP-NEXT:    v_mul_hi_u32 v10, s7, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, s7, v5
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_mul_lo_u32 v7, -1, v5
+; CGP-NEXT:    v_mul_lo_u32 v8, s6, v6
+; CGP-NEXT:    v_mul_hi_u32 v10, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v9, s6, v5
+; CGP-NEXT:    s_movk_i32 s7, 0x1000
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; CGP-NEXT:    v_mul_lo_u32 v8, v6, v9
 ; CGP-NEXT:    v_mul_lo_u32 v10, v5, v7
 ; CGP-NEXT:    v_mul_hi_u32 v11, v5, v9
 ; CGP-NEXT:    v_mul_hi_u32 v9, v6, v9
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -1517,9 +1517,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v7, -1, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, s7, v6
-; CGP-NEXT:    v_mul_hi_u32 v10, s7, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, s7, v5
+; CGP-NEXT:    v_mul_lo_u32 v8, s6, v6
+; CGP-NEXT:    v_mul_hi_u32 v10, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v9, s6, v5
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; CGP-NEXT:    v_mul_lo_u32 v8, v6, v9
@@ -1568,9 +1568,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CGP-NEXT:    v_mul_lo_u32 v8, 0, v7
-; CGP-NEXT:    v_mul_lo_u32 v9, s6, v6
-; CGP-NEXT:    v_mul_hi_u32 v11, s6, v7
-; CGP-NEXT:    v_mul_lo_u32 v10, s6, v7
+; CGP-NEXT:    v_mul_lo_u32 v9, s7, v6
+; CGP-NEXT:    v_mul_hi_u32 v11, s7, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, s7, v7
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
@@ -1596,7 +1596,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_addc_u32_e32 v11, vcc, 0, v10, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
-; CGP-NEXT:    v_cvt_f32_u32_e32 v9, v5
+; CGP-NEXT:    v_cvt_f32_u32_e32 v9, 0x1000
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v10, v11, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
@@ -1614,9 +1614,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, s7, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, s7, v6
-; CGP-NEXT:    v_mul_lo_u32 v11, s7, v6
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, s6, v6
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v6
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
@@ -1645,9 +1645,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, s7, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, s7, v6
-; CGP-NEXT:    v_mul_lo_u32 v11, s7, v6
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, s6, v6
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v6
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
@@ -1699,9 +1699,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CGP-NEXT:    v_mul_lo_u32 v8, 0, v4
-; CGP-NEXT:    v_mul_lo_u32 v9, s6, v6
-; CGP-NEXT:    v_mul_hi_u32 v11, s6, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, s6, v4
+; CGP-NEXT:    v_mul_lo_u32 v9, s7, v6
+; CGP-NEXT:    v_mul_hi_u32 v11, s7, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, s7, v4
 ; CGP-NEXT:    s_bfe_i32 s6, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -1745,34 +1745,33 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_sdiv_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s4, 0x12d8fb
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT:    s_mov_b32 s5, 0xffed2705
-; CHECK-NEXT:    s_bfe_i32 s6, -1, 0x10000
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x12d8fb
+; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
+; CHECK-NEXT:    s_mov_b32 s4, 0xffed2705
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v4
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v6, s5, v4
-; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v8, s5, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v2
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
+; CHECK-NEXT:    s_bfe_i32 s6, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
@@ -1793,9 +1792,10 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, s5, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s5, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
+; CHECK-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
@@ -2172,34 +2172,34 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-LABEL: v_sdiv_v2i64_oddk_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s6, 0x12d8fb
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT:    s_mov_b32 s7, 0xffed2705
-; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
+; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; CGP-NEXT:    s_mov_b32 s6, 0xffed2705
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; CGP-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v6, v6
 ; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; CGP-NEXT:    v_mul_lo_u32 v7, -1, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, s7, v6
-; CGP-NEXT:    v_mul_hi_u32 v10, s7, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, s7, v5
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_mul_lo_u32 v7, -1, v5
+; CGP-NEXT:    v_mul_lo_u32 v8, s6, v6
+; CGP-NEXT:    v_mul_hi_u32 v10, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v9, s6, v5
+; CGP-NEXT:    s_mov_b32 s7, 0x12d8fb
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; CGP-NEXT:    v_mul_lo_u32 v8, v6, v9
 ; CGP-NEXT:    v_mul_lo_u32 v10, v5, v7
 ; CGP-NEXT:    v_mul_hi_u32 v11, v5, v9
 ; CGP-NEXT:    v_mul_hi_u32 v9, v6, v9
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -2220,9 +2220,9 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v7, -1, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, s7, v6
-; CGP-NEXT:    v_mul_hi_u32 v10, s7, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, s7, v5
+; CGP-NEXT:    v_mul_lo_u32 v8, s6, v6
+; CGP-NEXT:    v_mul_hi_u32 v10, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v9, s6, v5
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; CGP-NEXT:    v_mul_lo_u32 v8, v6, v9
@@ -2271,9 +2271,9 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CGP-NEXT:    v_mul_lo_u32 v8, 0, v7
-; CGP-NEXT:    v_mul_lo_u32 v9, s6, v6
-; CGP-NEXT:    v_mul_hi_u32 v11, s6, v7
-; CGP-NEXT:    v_mul_lo_u32 v10, s6, v7
+; CGP-NEXT:    v_mul_lo_u32 v9, s7, v6
+; CGP-NEXT:    v_mul_hi_u32 v11, s7, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, s7, v7
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
@@ -2299,7 +2299,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_addc_u32_e32 v11, vcc, 0, v10, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
-; CGP-NEXT:    v_cvt_f32_u32_e32 v9, v5
+; CGP-NEXT:    v_cvt_f32_u32_e32 v9, 0x12d8fb
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v10, v11, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
@@ -2317,9 +2317,9 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, s7, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, s7, v6
-; CGP-NEXT:    v_mul_lo_u32 v11, s7, v6
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, s6, v6
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v6
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
@@ -2348,9 +2348,9 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, s7, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, s7, v6
-; CGP-NEXT:    v_mul_lo_u32 v11, s7, v6
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, s6, v6
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v6
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
@@ -2402,9 +2402,9 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CGP-NEXT:    v_mul_lo_u32 v8, 0, v4
-; CGP-NEXT:    v_mul_lo_u32 v9, s6, v6
-; CGP-NEXT:    v_mul_hi_u32 v11, s6, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, s6, v4
+; CGP-NEXT:    v_mul_lo_u32 v9, s7, v6
+; CGP-NEXT:    v_mul_hi_u32 v11, s7, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, s7, v4
 ; CGP-NEXT:    s_bfe_i32 s6, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -3298,11 +3298,10 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
 ; GISEL-LABEL: v_sdiv_i64_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
-; GISEL-NEXT:    v_and_b32_e32 v1, s4, v2
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
-; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
@@ -3326,10 +3325,9 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
 ; CGP-LABEL: v_sdiv_i64_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v1, s4, v2
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; CGP-NEXT:    v_rcp_f32_e32 v2, v1
 ; CGP-NEXT:    v_mul_f32_e32 v2, v0, v2
@@ -3352,8 +3350,7 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_sdiv_v2i64_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s6, 0xffffff
-; GISEL-NEXT:    v_and_b32_e32 v1, s6, v4
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
 ; GISEL-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v1
@@ -3362,8 +3359,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_and_b32_e32 v5, s6, v0
-; GISEL-NEXT:    v_and_b32_e32 v6, s6, v6
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v0
 ; GISEL-NEXT:    v_trunc_f32_e32 v4, v4
@@ -3378,7 +3375,7 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v10
-; GISEL-NEXT:    v_and_b32_e32 v0, s6, v2
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v2
 ; GISEL-NEXT:    v_mul_hi_u32 v2, v7, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v11
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
@@ -3624,15 +3621,14 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-LABEL: v_sdiv_v2i64_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v1, s4, v4
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; CGP-NEXT:    v_and_b32_e32 v4, s4, v6
+; CGP-NEXT:    v_and_b32_e32 v4, 0xffffff, v6
 ; CGP-NEXT:    v_rcp_f32_e32 v3, v1
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v4, v4
-; CGP-NEXT:    v_and_b32_e32 v2, s4, v2
+; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; CGP-NEXT:    v_mul_f32_e32 v3, v0, v3
 ; CGP-NEXT:    v_trunc_f32_e32 v3, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 3120da6cfc4c9..5c79eb9618a2b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -875,7 +875,6 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX8-LABEL: sdivrem_v4i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x4f7ffffe
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_ashr_i32 s2, s12, 31
@@ -887,113 +886,113 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX8-NEXT:    s_add_i32 s0, s13, s16
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_xor_b32 s13, s0, s16
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, s13
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s13
 ; GFX8-NEXT:    s_ashr_i32 s12, s8, 31
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_add_i32 s0, s8, s12
 ; GFX8-NEXT:    s_xor_b32 s0, s0, s12
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
+; GFX8-NEXT:    s_sub_i32 s8, 0, s13
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, v3, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT:    v_mul_lo_u32 v3, v0, s3
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s0, v3
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
-; GFX8-NEXT:    s_sub_i32 s0, 0, s13
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v4, s0, v1
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s3
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v3, s8, v1
 ; GFX8-NEXT:    s_xor_b32 s0, s12, s2
 ; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
 ; GFX8-NEXT:    s_add_i32 s1, s9, s2
-; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX8-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX8-NEXT:    v_xor_b32_e32 v3, s12, v3
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
+; GFX8-NEXT:    v_xor_b32_e32 v2, s12, v2
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
 ; GFX8-NEXT:    s_ashr_i32 s3, s14, 31
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_mul_lo_u32 v5, v1, s13
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s12, v3
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s13
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s12, v2
 ; GFX8-NEXT:    s_add_i32 s0, s14, s3
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v5
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s1, v3
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v1
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
 ; GFX8-NEXT:    s_xor_b32 s8, s0, s3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, s8
+; GFX8-NEXT:    v_subrev_u32_e64 v5, s[0:1], s13, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 1, v1
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s8
-; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s13, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
-; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v2
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s13, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v5, s[0:1], s13, v2
 ; GFX8-NEXT:    s_sub_i32 s0, 0, s8
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v6, s0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v5, s0, v3
 ; GFX8-NEXT:    s_ashr_i32 s9, s10, 31
 ; GFX8-NEXT:    s_add_i32 s1, s10, s9
 ; GFX8-NEXT:    s_xor_b32 s1, s1, s9
-; GFX8-NEXT:    v_mul_hi_u32 v6, v5, v6
+; GFX8-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GFX8-NEXT:    s_xor_b32 s0, s2, s16
-; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
+; GFX8-NEXT:    v_xor_b32_e32 v2, s2, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
-; GFX8-NEXT:    v_mul_hi_u32 v6, s1, v5
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s2, v3
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT:    v_mul_hi_u32 v3, s1, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s2, v2
 ; GFX8-NEXT:    s_ashr_i32 s2, s15, 31
-; GFX8-NEXT:    v_mul_lo_u32 v7, v6, s8
+; GFX8-NEXT:    v_mul_lo_u32 v6, v3, s8
 ; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
 ; GFX8-NEXT:    s_add_i32 s0, s15, s2
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v7
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 1, v6
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s1, v6
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
 ; GFX8-NEXT:    s_xor_b32 s10, s0, s2
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v7, s10
-; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s8, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v6
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
-; GFX8-NEXT:    v_mul_f32_e32 v2, v7, v2
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX8-NEXT:    v_subrev_u32_e64 v7, s[0:1], s8, v3
-; GFX8-NEXT:    s_sub_i32 s0, 0, s10
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v6, s10
+; GFX8-NEXT:    v_subrev_u32_e64 v7, s[0:1], s8, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 1, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
+; GFX8-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v7, s0, v2
+; GFX8-NEXT:    v_subrev_u32_e64 v7, s[0:1], s8, v2
+; GFX8-NEXT:    s_sub_i32 s0, 0, s10
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v2, v7, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v6
 ; GFX8-NEXT:    s_xor_b32 s0, s9, s3
 ; GFX8-NEXT:    s_ashr_i32 s3, s11, 31
 ; GFX8-NEXT:    s_add_i32 s1, s11, s3
-; GFX8-NEXT:    v_mul_hi_u32 v7, v2, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX8-NEXT:    v_mul_hi_u32 v2, v6, v2
 ; GFX8-NEXT:    s_xor_b32 s1, s1, s3
-; GFX8-NEXT:    v_xor_b32_e32 v6, s0, v6
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
-; GFX8-NEXT:    v_mul_hi_u32 v7, s1, v2
-; GFX8-NEXT:    v_xor_b32_e32 v3, s9, v3
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v6
-; GFX8-NEXT:    v_mul_lo_u32 v8, v7, s10
+; GFX8-NEXT:    v_xor_b32_e32 v3, s0, v3
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
+; GFX8-NEXT:    v_mul_hi_u32 v8, s1, v2
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v3
+; GFX8-NEXT:    v_xor_b32_e32 v3, s9, v7
+; GFX8-NEXT:    v_mul_lo_u32 v7, v8, s10
 ; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s9, v3
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v8
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v7
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v7
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 1, v8
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s10, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v7
@@ -1017,138 +1016,137 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX9-LABEL: sdivrem_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4f7ffffe
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s6, s12, 31
 ; GFX9-NEXT:    s_add_i32 s0, s12, s6
 ; GFX9-NEXT:    s_xor_b32 s7, s0, s6
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    s_ashr_i32 s5, s13, 31
-; GFX9-NEXT:    s_add_i32 s12, s13, s5
+; GFX9-NEXT:    s_ashr_i32 s4, s13, 31
+; GFX9-NEXT:    s_add_i32 s5, s13, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s12, s12, s5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s12
+; GFX9-NEXT:    s_xor_b32 s5, s5, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
 ; GFX9-NEXT:    s_sub_i32 s13, 0, s7
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_ashr_i32 s4, s8, 31
-; GFX9-NEXT:    s_add_i32 s8, s8, s4
-; GFX9-NEXT:    v_mul_lo_u32 v3, s13, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX9-NEXT:    s_ashr_i32 s12, s8, 31
+; GFX9-NEXT:    s_add_i32 s8, s8, s12
+; GFX9-NEXT:    v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_xor_b32 s8, s8, s4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX9-NEXT:    s_sub_i32 s16, 0, s12
+; GFX9-NEXT:    s_xor_b32 s8, s8, s12
+; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX9-NEXT:    s_sub_i32 s13, 0, s5
+; GFX9-NEXT:    v_mul_lo_u32 v3, s13, v1
 ; GFX9-NEXT:    s_ashr_i32 s13, s9, 31
-; GFX9-NEXT:    s_add_i32 s9, s9, s13
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s16, v1
+; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
+; GFX9-NEXT:    s_add_i32 s9, s9, s13
 ; GFX9-NEXT:    s_xor_b32 s9, s9, s13
-; GFX9-NEXT:    s_xor_b32 s6, s4, s6
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s7
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX9-NEXT:    s_xor_b32 s5, s13, s5
-; GFX9-NEXT:    v_sub_u32_e32 v4, s8, v4
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s7, v4
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s7
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s7, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s12
-; GFX9-NEXT:    v_xor_b32_e32 v3, s4, v3
+; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s7, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v2
+; GFX9-NEXT:    s_xor_b32 s6, s12, s6
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s4, v3
-; GFX9-NEXT:    s_ashr_i32 s4, s14, 31
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s5
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
-; GFX9-NEXT:    s_add_i32 s6, s14, s4
-; GFX9-NEXT:    s_xor_b32 s6, s6, s4
-; GFX9-NEXT:    v_sub_u32_e32 v3, s9, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s6
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s12, v3
+; GFX9-NEXT:    s_ashr_i32 s6, s14, 31
+; GFX9-NEXT:    s_add_i32 s7, s14, s6
+; GFX9-NEXT:    v_xor_b32_e32 v2, s12, v2
+; GFX9-NEXT:    s_xor_b32 s7, s7, s6
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s12, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, s9, v3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s7
+; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s5, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
+; GFX9-NEXT:    s_sub_i32 s8, 0, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v5, s8, v3
+; GFX9-NEXT:    s_xor_b32 s4, s13, s4
+; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
+; GFX9-NEXT:    s_ashr_i32 s4, s15, 31
+; GFX9-NEXT:    s_add_i32 s9, s15, s4
+; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GFX9-NEXT:    s_xor_b32 s9, s9, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s9
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s5, v2
+; GFX9-NEXT:    s_ashr_i32 s5, s10, 31
+; GFX9-NEXT:    s_add_i32 s8, s10, s5
+; GFX9-NEXT:    s_xor_b32 s8, s8, s5
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
+; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v2, s13, v2
+; GFX9-NEXT:    v_mul_lo_u32 v6, v3, s7
+; GFX9-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s13, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, s8, v6
+; GFX9-NEXT:    s_sub_i32 s8, 0, s9
+; GFX9-NEXT:    v_mul_lo_u32 v8, s8, v7
+; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
-; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v2
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
-; GFX9-NEXT:    s_sub_i32 s7, 0, s6
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, s7, v5
-; GFX9-NEXT:    v_xor_b32_e32 v1, s5, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s5, v1
-; GFX9-NEXT:    s_ashr_i32 s5, s15, 31
-; GFX9-NEXT:    s_add_i32 s9, s15, s5
-; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GFX9-NEXT:    s_xor_b32 s9, s9, s5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, s9
-; GFX9-NEXT:    s_ashr_i32 s7, s10, 31
-; GFX9-NEXT:    s_add_i32 s8, s10, s7
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s7, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v8, v7, v8
+; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s7, v2
+; GFX9-NEXT:    s_ashr_i32 s7, s11, 31
+; GFX9-NEXT:    s_add_i32 s8, s11, s7
 ; GFX9-NEXT:    s_xor_b32 s8, s8, s7
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v6
-; GFX9-NEXT:    v_mul_hi_u32 v6, s8, v5
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s12, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v6, s6
-; GFX9-NEXT:    v_mul_f32_e32 v2, v8, v2
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_xor_b32_e32 v3, s13, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s13, v3
-; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v7
-; GFX9-NEXT:    s_sub_i32 s8, 0, s9
-; GFX9-NEXT:    v_mul_lo_u32 v8, s8, v2
-; GFX9-NEXT:    v_add_u32_e32 v7, 1, v6
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v8
-; GFX9-NEXT:    v_add_u32_e32 v7, 1, v6
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s6, v3
-; GFX9-NEXT:    s_ashr_i32 s6, s11, 31
-; GFX9-NEXT:    s_add_i32 s8, s11, s6
-; GFX9-NEXT:    s_xor_b32 s8, s8, s6
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v8
-; GFX9-NEXT:    v_mul_hi_u32 v8, s8, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
+; GFX9-NEXT:    v_mul_hi_u32 v7, s8, v7
+; GFX9-NEXT:    s_xor_b32 s6, s5, s6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v6, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v2, s6, v3
+; GFX9-NEXT:    v_mul_lo_u32 v3, v7, s9
+; GFX9-NEXT:    v_add_u32_e32 v8, 1, v7
 ; GFX9-NEXT:    s_xor_b32 s4, s7, s4
-; GFX9-NEXT:    v_xor_b32_e32 v3, s7, v3
-; GFX9-NEXT:    v_mul_lo_u32 v7, v8, s9
-; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v6
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s7, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v7
-; GFX9-NEXT:    v_add_u32_e32 v7, 1, v8
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s6, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v3
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v8, s9, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v8, 1, v7
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v8, s9, v3
-; GFX9-NEXT:    s_xor_b32 s4, s6, s5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v3, v8, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s4, v7
+; GFX9-NEXT:    v_xor_b32_e32 v6, s5, v6
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_xor_b32_e32 v7, s6, v8
+; GFX9-NEXT:    v_xor_b32_e32 v7, s7, v8
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s6, v7
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s5, v6
+; GFX9-NEXT:    v_subrev_u32_e32 v7, s7, v7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3]
@@ -2473,8 +2471,7 @@ define amdgpu_kernel void @sdivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -2483,7 +2480,7 @@ define amdgpu_kernel void @sdivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_short v[0:1], v4
-; GFX8-NEXT:    v_and_b32_e32 v0, s0, v3
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
@@ -2558,9 +2555,8 @@ define amdgpu_kernel void @sdivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
@@ -2570,7 +2566,7 @@ define amdgpu_kernel void @sdivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v3
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -2884,13 +2880,12 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v0, s0, v3
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3225,51 +3220,51 @@ define amdgpu_kernel void @sdivrem_i3(i3 addrspace(1)* %out0, i3 addrspace(1)* %
 define amdgpu_kernel void @sdivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)* %out1, i27 %x, i27 %y) {
 ; GFX8-LABEL: sdivrem_i27:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX8-NEXT:    s_mov_b32 s9, 0x7ffffff
+; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x1b0000
-; GFX8-NEXT:    s_ashr_i32 s2, s1, 31
-; GFX8-NEXT:    s_add_i32 s1, s1, s2
-; GFX8-NEXT:    s_xor_b32 s3, s1, s2
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX8-NEXT:    s_sub_i32 s1, 0, s3
-; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x1b0000
-; GFX8-NEXT:    s_ashr_i32 s8, s0, 31
+; GFX8-NEXT:    s_bfe_i32 s0, s7, 0x1b0000
+; GFX8-NEXT:    s_ashr_i32 s7, s0, 31
+; GFX8-NEXT:    s_add_i32 s0, s0, s7
+; GFX8-NEXT:    s_xor_b32 s8, s0, s7
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX8-NEXT:    s_sub_i32 s0, 0, s8
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_add_i32 s0, s0, s8
-; GFX8-NEXT:    s_xor_b32 s0, s0, s8
-; GFX8-NEXT:    s_xor_b32 s2, s8, s2
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, s0, v0
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX8-NEXT:    s_bfe_i32 s4, s6, 0x1b0000
+; GFX8-NEXT:    s_ashr_i32 s5, s4, 31
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX8-NEXT:    s_add_i32 s4, s4, s5
+; GFX8-NEXT:    s_xor_b32 s4, s4, s5
+; GFX8-NEXT:    s_xor_b32 s6, s5, s7
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX8-NEXT:    v_mul_lo_u32 v1, v0, s3
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v2, s[0:1], s3, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v2, s[0:1], s3, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s2, v0
-; GFX8-NEXT:    v_xor_b32_e32 v1, s8, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s8, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, s9, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    flat_store_dword v[0:1], v3
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_and_b32_e32 v2, s9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_mul_hi_u32 v2, s4, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s8
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s8, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s8, v3
+; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v2
+; GFX8-NEXT:    v_xor_b32_e32 v3, s5, v3
+; GFX8-NEXT:    v_and_b32_e32 v2, 0x7ffffff, v2
+; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s5, v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0x7ffffff, v3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -3293,8 +3288,7 @@ define amdgpu_kernel void @sdivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v0
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    s_xor_b32 s5, s8, s6
-; GFX9-NEXT:    s_mov_b32 s4, 0x7ffffff
+; GFX9-NEXT:    s_xor_b32 s4, s8, s6
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
@@ -3310,14 +3304,14 @@ define amdgpu_kernel void @sdivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s5, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s5, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s8, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7ffffff, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7ffffff, v1
 ; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 97a09585b73e5..01020cd2c42fa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -401,10 +401,9 @@ define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) {
 ; GFX7-LABEL: v_shl_v2i64_zext_v2i32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_brev_b32 s4, -4
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0x3fffffff, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
 ; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], 2
@@ -413,10 +412,9 @@ define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) {
 ; GFX8-LABEL: v_shl_v2i64_zext_v2i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s4, -4
-; GFX8-NEXT:    v_and_b32_e32 v2, s4, v1
+; GFX8-NEXT:    v_and_b32_e32 v2, 0x3fffffff, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
@@ -425,10 +423,9 @@ define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) {
 ; GFX9-LABEL: v_shl_v2i64_zext_v2i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_brev_b32 s4, -4
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x3fffffff, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
@@ -485,9 +482,8 @@ define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
 ; GFX7-LABEL: v_shl_v2i64_sext_v2i32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_brev_b32 s4, -8
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0x1fffffff, v1
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
@@ -497,9 +493,8 @@ define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
 ; GFX8-LABEL: v_shl_v2i64_sext_v2i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s4, -8
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, s4, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0x1fffffff, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
@@ -509,9 +504,8 @@ define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
 ; GFX9-LABEL: v_shl_v2i64_sext_v2i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_brev_b32 s4, -8
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x1fffffff, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
@@ -659,13 +653,12 @@ define <2 x i32> @v_shl_v2i32_zext_v2i16(<2 x i16> %x) {
 ; GFX7-LABEL: v_shl_v2i32_zext_v2i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0x3fff3fff, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 15173a7d5cea0..5a366388ab8e4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -718,10 +718,9 @@ define <2 x i16> @v_shl_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 ; GFX6-LABEL: v_shl_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -830,13 +829,12 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun
 define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
 ; GFX6-LABEL: shl_v2i16_sv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s2, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshl_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -870,10 +868,9 @@ define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount)
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
-; GFX6-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -916,21 +913,20 @@ define <2 x float> @v_shl_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 ; GFX6-LABEL: v_shl_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v5
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v6
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v4, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -1061,38 +1057,36 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-LABEL: v_shl_v8i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v8
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v8, v0
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v9
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v9
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v8, v1
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v10
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v10
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v8, v2
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v11
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v11
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v8, v3
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v12
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v12
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v8, v4
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v13
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_mov_b32_e32 v16, 0xffff
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v13
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, v8, v5
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v14
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v14
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, v8, v6
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v15
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v15
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, v2, v16
-; GFX6-NEXT:    v_and_b32_e32 v2, v3, v16
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, v8, v7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, v5, v16
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, v4, v16
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v7, v16
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT:    v_and_b32_e32 v3, v6, v16
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
index f4eb1437cce30..b902dc471e37a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
@@ -568,19 +568,18 @@ define <2 x i32> @v_srem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; GISEL-LABEL: v_srem_v2i32_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_movk_i32 s4, 0x1000
+; GISEL-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
+; GISEL-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; GISEL-NEXT:    v_lshl_b32_e32 v2, s4, v2
-; GISEL-NEXT:    v_lshl_b32_e32 v3, s4, v3
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v2
@@ -626,19 +625,18 @@ define <2 x i32> @v_srem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP-LABEL: v_srem_v2i32_pow2_shl_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_movk_i32 s4, 0x1000
+; CGP-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
+; CGP-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; CGP-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; CGP-NEXT:    v_lshl_b32_e32 v2, s4, v2
-; CGP-NEXT:    v_lshl_b32_e32 v3, s4, v3
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; CGP-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; CGP-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v2
@@ -697,9 +695,8 @@ define i32 @v_srem_i32_24bit(i32 %num, i32 %den) {
 ; GISEL-LABEL: v_srem_i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
-; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
-; GISEL-NEXT:    v_and_b32_e32 v1, s4, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -730,9 +727,8 @@ define i32 @v_srem_i32_24bit(i32 %num, i32 %den) {
 ; CGP-LABEL: v_srem_i32_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
-; CGP-NEXT:    v_and_b32_e32 v1, s4, v1
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; CGP-NEXT:    v_rcp_f32_e32 v2, v2
@@ -765,11 +761,10 @@ define <2 x i32> @v_srem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-LABEL: v_srem_v2i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
-; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
-; GISEL-NEXT:    v_and_b32_e32 v1, s4, v1
-; GISEL-NEXT:    v_and_b32_e32 v2, s4, v2
-; GISEL-NEXT:    v_and_b32_e32 v3, s4, v3
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
@@ -825,11 +820,10 @@ define <2 x i32> @v_srem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-LABEL: v_srem_v2i32_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
-; CGP-NEXT:    v_and_b32_e32 v1, s4, v1
-; CGP-NEXT:    v_and_b32_e32 v2, s4, v2
-; CGP-NEXT:    v_and_b32_e32 v3, s4, v3
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; CGP-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; CGP-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 33d88ad985283..a27ef9ba086cf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -1022,34 +1022,33 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-LABEL: v_srem_i64_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_movk_i32 s4, 0x1000
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT:    s_movk_i32 s5, 0xf000
-; CHECK-NEXT:    s_bfe_i32 s6, -1, 0x10000
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x1000
+; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
+; CHECK-NEXT:    s_movk_i32 s4, 0xf000
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v4
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v6, s5, v4
-; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v8, s5, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v2
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
+; CHECK-NEXT:    s_bfe_i32 s6, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
@@ -1070,9 +1069,10 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, s5, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s5, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
+; CHECK-NEXT:    s_movk_i32 s4, 0x1000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
@@ -1445,34 +1445,34 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-LABEL: v_srem_v2i64_pow2k_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_movk_i32 s6, 0x1000
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT:    s_movk_i32 s7, 0xf000
-; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x1000
+; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; CGP-NEXT:    s_movk_i32 s6, 0xf000
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; CGP-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v6, v6
 ; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; CGP-NEXT:    v_mul_lo_u32 v7, -1, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, s7, v6
-; CGP-NEXT:    v_mul_hi_u32 v10, s7, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, s7, v5
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_mul_lo_u32 v7, -1, v5
+; CGP-NEXT:    v_mul_lo_u32 v8, s6, v6
+; CGP-NEXT:    v_mul_hi_u32 v10, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v9, s6, v5
+; CGP-NEXT:    s_movk_i32 s7, 0x1000
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; CGP-NEXT:    v_mul_lo_u32 v8, v6, v9
 ; CGP-NEXT:    v_mul_lo_u32 v10, v5, v7
 ; CGP-NEXT:    v_mul_hi_u32 v11, v5, v9
 ; CGP-NEXT:    v_mul_hi_u32 v9, v6, v9
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -1493,9 +1493,9 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v7, -1, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, s7, v6
-; CGP-NEXT:    v_mul_hi_u32 v10, s7, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, s7, v5
+; CGP-NEXT:    v_mul_lo_u32 v8, s6, v6
+; CGP-NEXT:    v_mul_hi_u32 v10, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v9, s6, v5
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; CGP-NEXT:    v_mul_lo_u32 v8, v6, v9
@@ -1544,9 +1544,9 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CGP-NEXT:    v_mul_lo_u32 v8, 0, v7
-; CGP-NEXT:    v_mul_lo_u32 v6, s6, v6
-; CGP-NEXT:    v_mul_lo_u32 v9, s6, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, s6, v7
+; CGP-NEXT:    v_mul_lo_u32 v6, s7, v6
+; CGP-NEXT:    v_mul_lo_u32 v9, s7, v7
+; CGP-NEXT:    v_mul_hi_u32 v7, s7, v7
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v9
@@ -1569,7 +1569,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_sub_i32_e32 v10, vcc, v8, v5
 ; CGP-NEXT:    v_subbrev_u32_e32 v11, vcc, 0, v1, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; CGP-NEXT:    v_cvt_f32_u32_e32 v9, v5
+; CGP-NEXT:    v_cvt_f32_u32_e32 v9, 0x1000
 ; CGP-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
@@ -1587,9 +1587,9 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, s7, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, s7, v6
-; CGP-NEXT:    v_mul_lo_u32 v11, s7, v6
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, s6, v6
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v6
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
@@ -1618,9 +1618,9 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, s7, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, s7, v6
-; CGP-NEXT:    v_mul_lo_u32 v11, s7, v6
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, s6, v6
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v6
 ; CGP-NEXT:    v_xor_b32_e32 v2, v2, v7
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
@@ -1673,9 +1673,9 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CGP-NEXT:    v_mul_lo_u32 v8, 0, v4
-; CGP-NEXT:    v_mul_lo_u32 v6, s6, v6
-; CGP-NEXT:    v_mul_lo_u32 v9, s6, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, s6, v4
+; CGP-NEXT:    v_mul_lo_u32 v6, s7, v6
+; CGP-NEXT:    v_mul_lo_u32 v9, s7, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, s7, v4
 ; CGP-NEXT:    s_bfe_i32 s6, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
@@ -1717,34 +1717,33 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_srem_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s4, 0x12d8fb
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT:    s_mov_b32 s5, 0xffed2705
-; CHECK-NEXT:    s_bfe_i32 s6, -1, 0x10000
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x12d8fb
+; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
+; CHECK-NEXT:    s_mov_b32 s4, 0xffed2705
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v4
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v6, s5, v4
-; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v8, s5, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v2
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
+; CHECK-NEXT:    s_bfe_i32 s6, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
@@ -1765,9 +1764,10 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, s5, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s5, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
+; CHECK-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
@@ -2140,34 +2140,34 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-LABEL: v_srem_v2i64_oddk_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s6, 0x12d8fb
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT:    s_mov_b32 s7, 0xffed2705
-; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
+; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; CGP-NEXT:    s_mov_b32 s6, 0xffed2705
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; CGP-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v6, v6
 ; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; CGP-NEXT:    v_mul_lo_u32 v7, -1, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, s7, v6
-; CGP-NEXT:    v_mul_hi_u32 v10, s7, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, s7, v5
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_mul_lo_u32 v7, -1, v5
+; CGP-NEXT:    v_mul_lo_u32 v8, s6, v6
+; CGP-NEXT:    v_mul_hi_u32 v10, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v9, s6, v5
+; CGP-NEXT:    s_mov_b32 s7, 0x12d8fb
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; CGP-NEXT:    v_mul_lo_u32 v8, v6, v9
 ; CGP-NEXT:    v_mul_lo_u32 v10, v5, v7
 ; CGP-NEXT:    v_mul_hi_u32 v11, v5, v9
 ; CGP-NEXT:    v_mul_hi_u32 v9, v6, v9
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -2188,9 +2188,9 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v7, -1, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, s7, v6
-; CGP-NEXT:    v_mul_hi_u32 v10, s7, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, s7, v5
+; CGP-NEXT:    v_mul_lo_u32 v8, s6, v6
+; CGP-NEXT:    v_mul_hi_u32 v10, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v9, s6, v5
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; CGP-NEXT:    v_mul_lo_u32 v8, v6, v9
@@ -2239,9 +2239,9 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CGP-NEXT:    v_mul_lo_u32 v8, 0, v7
-; CGP-NEXT:    v_mul_lo_u32 v6, s6, v6
-; CGP-NEXT:    v_mul_lo_u32 v9, s6, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, s6, v7
+; CGP-NEXT:    v_mul_lo_u32 v6, s7, v6
+; CGP-NEXT:    v_mul_lo_u32 v9, s7, v7
+; CGP-NEXT:    v_mul_hi_u32 v7, s7, v7
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v9
@@ -2264,7 +2264,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_sub_i32_e32 v10, vcc, v8, v5
 ; CGP-NEXT:    v_subbrev_u32_e32 v11, vcc, 0, v1, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; CGP-NEXT:    v_cvt_f32_u32_e32 v9, v5
+; CGP-NEXT:    v_cvt_f32_u32_e32 v9, 0x12d8fb
 ; CGP-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
@@ -2282,9 +2282,9 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, s7, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, s7, v6
-; CGP-NEXT:    v_mul_lo_u32 v11, s7, v6
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, s6, v6
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v6
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
@@ -2313,9 +2313,9 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, s7, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, s7, v6
-; CGP-NEXT:    v_mul_lo_u32 v11, s7, v6
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, s6, v6
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v6
 ; CGP-NEXT:    v_xor_b32_e32 v2, v2, v7
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
@@ -2368,9 +2368,9 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CGP-NEXT:    v_mul_lo_u32 v8, 0, v4
-; CGP-NEXT:    v_mul_lo_u32 v6, s6, v6
-; CGP-NEXT:    v_mul_lo_u32 v9, s6, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, s6, v4
+; CGP-NEXT:    v_mul_lo_u32 v6, s7, v6
+; CGP-NEXT:    v_mul_lo_u32 v9, s7, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, s7, v4
 ; CGP-NEXT:    s_bfe_i32 s6, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
@@ -3246,11 +3246,10 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) {
 ; GISEL-LABEL: v_srem_i64_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
-; GISEL-NEXT:    v_and_b32_e32 v1, s4, v2
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
-; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
@@ -3272,10 +3271,9 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) {
 ; CGP-LABEL: v_srem_i64_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v1, s4, v2
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v2, v1
-; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v3, v0
 ; CGP-NEXT:    v_rcp_f32_e32 v4, v2
 ; CGP-NEXT:    v_mul_f32_e32 v4, v3, v4
@@ -3300,8 +3298,7 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_srem_v2i64_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s6, 0xffffff
-; GISEL-NEXT:    v_and_b32_e32 v1, s6, v4
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
 ; GISEL-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v1
@@ -3310,8 +3307,8 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_and_b32_e32 v5, s6, v0
-; GISEL-NEXT:    v_and_b32_e32 v6, s6, v6
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v0
 ; GISEL-NEXT:    v_trunc_f32_e32 v4, v4
@@ -3326,7 +3323,7 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v10
-; GISEL-NEXT:    v_and_b32_e32 v0, s6, v2
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v2
 ; GISEL-NEXT:    v_mul_hi_u32 v2, v7, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v11
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
@@ -3570,14 +3567,13 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-LABEL: v_srem_v2i64_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v1, s4, v4
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v3, v1
-; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v4, v0
-; CGP-NEXT:    v_and_b32_e32 v6, s4, v6
+; CGP-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
 ; CGP-NEXT:    v_rcp_f32_e32 v5, v3
-; CGP-NEXT:    v_and_b32_e32 v2, s4, v2
+; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; CGP-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v5, v5
 ; CGP-NEXT:    v_mad_f32 v4, -v5, v3, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 95e1c42572dd3..78b620ec8f8d5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -259,10 +259,9 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -273,21 +272,19 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v4, -1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    s_movk_i32 s5, 0x8000
-; GFX8-NEXT:    v_subrev_u16_e32 v4, s4, v4
+; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x7fff, v4
 ; GFX8-NEXT:    v_min_i16_e32 v5, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v5, s5, v5
+; GFX8-NEXT:    v_subrev_u16_e32 v5, 0x8000, v5
 ; GFX8-NEXT:    v_max_i16_e32 v1, v4, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v5
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_max_i16_e32 v1, -1, v3
-; GFX8-NEXT:    v_subrev_u16_e32 v1, s4, v1
+; GFX8-NEXT:    v_subrev_u16_e32 v1, 0x7fff, v1
 ; GFX8-NEXT:    v_min_i16_e32 v4, -1, v3
-; GFX8-NEXT:    v_subrev_u16_e32 v4, s5, v4
+; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x8000, v4
 ; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v3, v1
@@ -512,22 +509,21 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -540,40 +536,37 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v8, -1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    s_movk_i32 s5, 0x8000
-; GFX8-NEXT:    v_subrev_u16_e32 v8, s4, v8
-; GFX8-NEXT:    v_min_i16_e32 v10, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v10, s5, v10
+; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x7fff, v8
+; GFX8-NEXT:    v_min_i16_e32 v9, -1, v0
+; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x8000, v9
 ; GFX8-NEXT:    v_max_i16_e32 v1, v8, v1
-; GFX8-NEXT:    v_min_i16_e32 v1, v1, v10
+; GFX8-NEXT:    v_min_i16_e32 v1, v1, v9
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_max_i16_e32 v1, -1, v3
-; GFX8-NEXT:    v_subrev_u16_e32 v1, s4, v1
+; GFX8-NEXT:    v_subrev_u16_e32 v1, 0x7fff, v1
 ; GFX8-NEXT:    v_min_i16_e32 v8, -1, v3
-; GFX8-NEXT:    v_subrev_u16_e32 v8, s5, v8
+; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x8000, v8
 ; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
-; GFX8-NEXT:    v_mov_b32_e32 v9, 0x7fff
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v8
 ; GFX8-NEXT:    v_max_i16_e32 v4, -1, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v3, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v6
-; GFX8-NEXT:    v_sub_u16_e32 v4, v4, v9
+; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x7fff, v4
 ; GFX8-NEXT:    v_min_i16_e32 v6, -1, v2
-; GFX8-NEXT:    v_subrev_u16_e32 v6, s5, v6
+; GFX8-NEXT:    v_subrev_u16_e32 v6, 0x8000, v6
 ; GFX8-NEXT:    v_max_i16_e32 v3, v4, v3
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v6
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v5
 ; GFX8-NEXT:    v_max_i16_e32 v5, -1, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
-; GFX8-NEXT:    v_sub_u16_e32 v5, v5, v9
+; GFX8-NEXT:    v_subrev_u16_e32 v5, 0x7fff, v5
 ; GFX8-NEXT:    v_min_i16_e32 v6, -1, v3
 ; GFX8-NEXT:    v_subrev_u16_e32 v6, 0x8000, v6
 ; GFX8-NEXT:    v_max_i16_e32 v4, v5, v4
@@ -619,7 +612,7 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -853,7 +846,7 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    s_mov_b32 s5, 24
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, s0, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
@@ -2685,19 +2678,17 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-LABEL: v_ssubsat_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v3, -1, v0
-; GFX8-NEXT:    s_movk_i32 s5, 0x8000
-; GFX8-NEXT:    v_subrev_u16_e32 v3, s4, v3
+; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x7fff, v3
 ; GFX8-NEXT:    v_min_i16_e32 v4, -1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v4, s5, v4
+; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x8000, v4
 ; GFX8-NEXT:    v_max_i16_e32 v3, v3, v1
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v4
 ; GFX8-NEXT:    v_max_i16_e32 v4, -1, v2
-; GFX8-NEXT:    v_subrev_u16_e32 v4, s4, v4
+; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x7fff, v4
 ; GFX8-NEXT:    v_min_i16_e32 v5, -1, v2
-; GFX8-NEXT:    v_subrev_u16_e32 v5, s5, v5
+; GFX8-NEXT:    v_subrev_u16_e32 v5, 0x8000, v5
 ; GFX8-NEXT:    v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v5
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v3
@@ -2824,10 +2815,9 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2895,30 +2885,27 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: ssubsat_v2i16_vs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
-; GFX8-NEXT:    s_movk_i32 s3, 0x8000
-; GFX8-NEXT:    v_subrev_u16_e32 v2, s2, v2
+; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x7fff, v2
 ; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v3, s3, v3
+; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
 ; GFX8-NEXT:    v_max_i16_e32 v2, s0, v2
 ; GFX8-NEXT:    v_min_i16_e32 v2, v2, v3
 ; GFX8-NEXT:    v_max_i16_e32 v3, -1, v1
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    v_subrev_u16_e32 v3, s2, v3
+; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x7fff, v3
 ; GFX8-NEXT:    v_min_i16_e32 v4, -1, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v4, s3, v4
+; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x8000, v4
 ; GFX8-NEXT:    v_max_i16_e32 v3, s1, v3
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v4
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v2
@@ -2995,17 +2982,16 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -3013,32 +2999,30 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-LABEL: v_ssubsat_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v6, -1, v0
-; GFX8-NEXT:    s_movk_i32 s5, 0x8000
-; GFX8-NEXT:    v_subrev_u16_e32 v6, s4, v6
+; GFX8-NEXT:    v_subrev_u16_e32 v6, 0x7fff, v6
 ; GFX8-NEXT:    v_min_i16_e32 v7, -1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v7, s5, v7
+; GFX8-NEXT:    v_subrev_u16_e32 v7, 0x8000, v7
 ; GFX8-NEXT:    v_max_i16_e32 v6, v6, v2
 ; GFX8-NEXT:    v_min_i16_e32 v6, v6, v7
 ; GFX8-NEXT:    v_max_i16_e32 v7, -1, v4
-; GFX8-NEXT:    v_subrev_u16_e32 v7, s4, v7
+; GFX8-NEXT:    v_subrev_u16_e32 v7, 0x7fff, v7
 ; GFX8-NEXT:    v_min_i16_e32 v8, -1, v4
-; GFX8-NEXT:    v_subrev_u16_e32 v8, s5, v8
+; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x8000, v8
 ; GFX8-NEXT:    v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_i16_e32 v7, -1, v1
 ; GFX8-NEXT:    v_min_i16_e32 v2, v2, v8
-; GFX8-NEXT:    v_subrev_u16_e32 v7, s4, v7
+; GFX8-NEXT:    v_subrev_u16_e32 v7, 0x7fff, v7
 ; GFX8-NEXT:    v_min_i16_e32 v8, -1, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v8, s5, v8
+; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x8000, v8
 ; GFX8-NEXT:    v_max_i16_e32 v7, v7, v3
 ; GFX8-NEXT:    v_min_i16_e32 v7, v7, v8
 ; GFX8-NEXT:    v_max_i16_e32 v8, -1, v5
-; GFX8-NEXT:    v_subrev_u16_e32 v8, s4, v8
+; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x7fff, v8
 ; GFX8-NEXT:    v_min_i16_e32 v9, -1, v5
-; GFX8-NEXT:    v_subrev_u16_e32 v9, s5, v9
+; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x8000, v9
 ; GFX8-NEXT:    v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v9
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v6
@@ -3280,24 +3264,23 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v15
 ; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -3305,56 +3288,52 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-LABEL: v_ssubsat_v6i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v9, -1, v0
-; GFX8-NEXT:    s_movk_i32 s5, 0x8000
-; GFX8-NEXT:    v_subrev_u16_e32 v9, s4, v9
-; GFX8-NEXT:    v_min_i16_e32 v11, -1, v0
+; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x7fff, v9
+; GFX8-NEXT:    v_min_i16_e32 v10, -1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v11, s5, v11
+; GFX8-NEXT:    v_subrev_u16_e32 v10, 0x8000, v10
 ; GFX8-NEXT:    v_max_i16_e32 v9, v9, v3
-; GFX8-NEXT:    v_min_i16_e32 v9, v9, v11
-; GFX8-NEXT:    v_max_i16_e32 v11, -1, v6
-; GFX8-NEXT:    v_subrev_u16_e32 v11, s4, v11
-; GFX8-NEXT:    v_min_i16_e32 v13, -1, v6
-; GFX8-NEXT:    v_subrev_u16_e32 v13, s5, v13
-; GFX8-NEXT:    v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_max_i16_e32 v11, -1, v1
-; GFX8-NEXT:    v_min_i16_e32 v3, v3, v13
-; GFX8-NEXT:    v_subrev_u16_e32 v11, s4, v11
-; GFX8-NEXT:    v_min_i16_e32 v13, -1, v1
+; GFX8-NEXT:    v_min_i16_e32 v9, v9, v10
+; GFX8-NEXT:    v_max_i16_e32 v10, -1, v6
+; GFX8-NEXT:    v_subrev_u16_e32 v10, 0x7fff, v10
+; GFX8-NEXT:    v_min_i16_e32 v11, -1, v6
+; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x8000, v11
+; GFX8-NEXT:    v_max_i16_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_max_i16_e32 v10, -1, v1
+; GFX8-NEXT:    v_min_i16_e32 v3, v3, v11
+; GFX8-NEXT:    v_subrev_u16_e32 v10, 0x7fff, v10
+; GFX8-NEXT:    v_min_i16_e32 v11, -1, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v13, s5, v13
-; GFX8-NEXT:    v_max_i16_e32 v11, v11, v4
-; GFX8-NEXT:    v_min_i16_e32 v11, v11, v13
-; GFX8-NEXT:    v_max_i16_e32 v13, -1, v7
-; GFX8-NEXT:    v_subrev_u16_e32 v13, s4, v13
-; GFX8-NEXT:    v_min_i16_e32 v14, -1, v7
-; GFX8-NEXT:    v_mov_b32_e32 v10, 0x7fff
-; GFX8-NEXT:    v_subrev_u16_e32 v14, s5, v14
-; GFX8-NEXT:    v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_max_i16_e32 v13, -1, v2
-; GFX8-NEXT:    v_mov_b32_e32 v12, 0xffff8000
-; GFX8-NEXT:    v_min_i16_e32 v4, v4, v14
-; GFX8-NEXT:    v_sub_u16_e32 v13, v13, v10
-; GFX8-NEXT:    v_min_i16_e32 v14, -1, v2
+; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x8000, v11
+; GFX8-NEXT:    v_max_i16_e32 v10, v10, v4
+; GFX8-NEXT:    v_min_i16_e32 v10, v10, v11
+; GFX8-NEXT:    v_max_i16_e32 v11, -1, v7
+; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x7fff, v11
+; GFX8-NEXT:    v_min_i16_e32 v12, -1, v7
+; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x8000, v12
+; GFX8-NEXT:    v_max_i16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_max_i16_e32 v11, -1, v2
+; GFX8-NEXT:    v_min_i16_e32 v4, v4, v12
+; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x7fff, v11
+; GFX8-NEXT:    v_min_i16_e32 v12, -1, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX8-NEXT:    v_sub_u16_e32 v14, v14, v12
-; GFX8-NEXT:    v_max_i16_e32 v13, v13, v5
-; GFX8-NEXT:    v_min_i16_e32 v13, v13, v14
-; GFX8-NEXT:    v_max_i16_e32 v14, -1, v8
-; GFX8-NEXT:    v_sub_u16_e32 v10, v14, v10
-; GFX8-NEXT:    v_min_i16_e32 v14, -1, v8
-; GFX8-NEXT:    v_sub_u16_e32 v12, v14, v12
-; GFX8-NEXT:    v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x8000, v12
+; GFX8-NEXT:    v_max_i16_e32 v11, v11, v5
+; GFX8-NEXT:    v_min_i16_e32 v11, v11, v12
+; GFX8-NEXT:    v_max_i16_e32 v12, -1, v8
+; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x7fff, v12
+; GFX8-NEXT:    v_min_i16_e32 v13, -1, v8
+; GFX8-NEXT:    v_subrev_u16_e32 v13, 0x8000, v13
+; GFX8-NEXT:    v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v9
 ; GFX8-NEXT:    v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_i16_e32 v5, v5, v12
+; GFX8-NEXT:    v_min_i16_e32 v5, v5, v13
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v11
+; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v10
 ; GFX8-NEXT:    v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v13
+; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v11
 ; GFX8-NEXT:    v_sub_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -3656,32 +3635,31 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v7
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v8
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 16, v6
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v6
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -3689,72 +3667,68 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-LABEL: v_ssubsat_v8i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v12, -1, v0
-; GFX8-NEXT:    s_movk_i32 s5, 0x8000
-; GFX8-NEXT:    v_subrev_u16_e32 v12, s4, v12
-; GFX8-NEXT:    v_min_i16_e32 v14, -1, v0
+; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x7fff, v12
+; GFX8-NEXT:    v_min_i16_e32 v13, -1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v14, s5, v14
+; GFX8-NEXT:    v_subrev_u16_e32 v13, 0x8000, v13
 ; GFX8-NEXT:    v_max_i16_e32 v12, v12, v4
-; GFX8-NEXT:    v_min_i16_e32 v12, v12, v14
-; GFX8-NEXT:    v_max_i16_e32 v14, -1, v8
-; GFX8-NEXT:    v_subrev_u16_e32 v14, s4, v14
-; GFX8-NEXT:    v_min_i16_e32 v16, -1, v8
-; GFX8-NEXT:    v_subrev_u16_e32 v16, s5, v16
-; GFX8-NEXT:    v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_max_i16_e32 v14, -1, v1
-; GFX8-NEXT:    v_min_i16_e32 v4, v4, v16
-; GFX8-NEXT:    v_subrev_u16_e32 v14, s4, v14
-; GFX8-NEXT:    v_min_i16_e32 v16, -1, v1
+; GFX8-NEXT:    v_min_i16_e32 v12, v12, v13
+; GFX8-NEXT:    v_max_i16_e32 v13, -1, v8
+; GFX8-NEXT:    v_subrev_u16_e32 v13, 0x7fff, v13
+; GFX8-NEXT:    v_min_i16_e32 v14, -1, v8
+; GFX8-NEXT:    v_subrev_u16_e32 v14, 0x8000, v14
+; GFX8-NEXT:    v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_max_i16_e32 v13, -1, v1
+; GFX8-NEXT:    v_min_i16_e32 v4, v4, v14
+; GFX8-NEXT:    v_subrev_u16_e32 v13, 0x7fff, v13
+; GFX8-NEXT:    v_min_i16_e32 v14, -1, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v16, s5, v16
-; GFX8-NEXT:    v_max_i16_e32 v14, v14, v5
-; GFX8-NEXT:    v_min_i16_e32 v14, v14, v16
-; GFX8-NEXT:    v_max_i16_e32 v16, -1, v9
-; GFX8-NEXT:    v_subrev_u16_e32 v16, s4, v16
-; GFX8-NEXT:    v_min_i16_e32 v17, -1, v9
-; GFX8-NEXT:    v_mov_b32_e32 v13, 0x7fff
-; GFX8-NEXT:    v_subrev_u16_e32 v17, s5, v17
-; GFX8-NEXT:    v_max_i16_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_max_i16_e32 v16, -1, v2
-; GFX8-NEXT:    v_mov_b32_e32 v15, 0xffff8000
-; GFX8-NEXT:    v_min_i16_e32 v5, v5, v17
-; GFX8-NEXT:    v_sub_u16_e32 v16, v16, v13
-; GFX8-NEXT:    v_min_i16_e32 v17, -1, v2
+; GFX8-NEXT:    v_subrev_u16_e32 v14, 0x8000, v14
+; GFX8-NEXT:    v_max_i16_e32 v13, v13, v5
+; GFX8-NEXT:    v_min_i16_e32 v13, v13, v14
+; GFX8-NEXT:    v_max_i16_e32 v14, -1, v9
+; GFX8-NEXT:    v_subrev_u16_e32 v14, 0x7fff, v14
+; GFX8-NEXT:    v_min_i16_e32 v15, -1, v9
+; GFX8-NEXT:    v_subrev_u16_e32 v15, 0x8000, v15
+; GFX8-NEXT:    v_max_i16_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_max_i16_e32 v14, -1, v2
+; GFX8-NEXT:    v_min_i16_e32 v5, v5, v15
+; GFX8-NEXT:    v_subrev_u16_e32 v14, 0x7fff, v14
+; GFX8-NEXT:    v_min_i16_e32 v15, -1, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX8-NEXT:    v_sub_u16_e32 v17, v17, v15
-; GFX8-NEXT:    v_max_i16_e32 v16, v16, v6
-; GFX8-NEXT:    v_min_i16_e32 v16, v16, v17
-; GFX8-NEXT:    v_max_i16_e32 v17, -1, v10
-; GFX8-NEXT:    v_sub_u16_e32 v17, v17, v13
-; GFX8-NEXT:    v_min_i16_e32 v18, -1, v10
-; GFX8-NEXT:    v_sub_u16_e32 v18, v18, v15
-; GFX8-NEXT:    v_max_i16_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_max_i16_e32 v17, -1, v3
-; GFX8-NEXT:    v_min_i16_e32 v6, v6, v18
-; GFX8-NEXT:    v_sub_u16_e32 v17, v17, v13
-; GFX8-NEXT:    v_min_i16_e32 v18, -1, v3
+; GFX8-NEXT:    v_subrev_u16_e32 v15, 0x8000, v15
+; GFX8-NEXT:    v_max_i16_e32 v14, v14, v6
+; GFX8-NEXT:    v_min_i16_e32 v14, v14, v15
+; GFX8-NEXT:    v_max_i16_e32 v15, -1, v10
+; GFX8-NEXT:    v_subrev_u16_e32 v15, 0x7fff, v15
+; GFX8-NEXT:    v_min_i16_e32 v16, -1, v10
+; GFX8-NEXT:    v_subrev_u16_e32 v16, 0x8000, v16
+; GFX8-NEXT:    v_max_i16_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_max_i16_e32 v15, -1, v3
+; GFX8-NEXT:    v_min_i16_e32 v6, v6, v16
+; GFX8-NEXT:    v_subrev_u16_e32 v15, 0x7fff, v15
+; GFX8-NEXT:    v_min_i16_e32 v16, -1, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX8-NEXT:    v_sub_u16_e32 v18, v18, v15
-; GFX8-NEXT:    v_max_i16_e32 v17, v17, v7
-; GFX8-NEXT:    v_min_i16_e32 v17, v17, v18
-; GFX8-NEXT:    v_max_i16_e32 v18, -1, v11
-; GFX8-NEXT:    v_sub_u16_e32 v13, v18, v13
-; GFX8-NEXT:    v_min_i16_e32 v18, -1, v11
+; GFX8-NEXT:    v_subrev_u16_e32 v16, 0x8000, v16
+; GFX8-NEXT:    v_max_i16_e32 v15, v15, v7
+; GFX8-NEXT:    v_min_i16_e32 v15, v15, v16
+; GFX8-NEXT:    v_max_i16_e32 v16, -1, v11
+; GFX8-NEXT:    v_subrev_u16_e32 v16, 0x7fff, v16
+; GFX8-NEXT:    v_min_i16_e32 v17, -1, v11
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v12
 ; GFX8-NEXT:    v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_sub_u16_e32 v15, v18, v15
-; GFX8-NEXT:    v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_subrev_u16_e32 v17, 0x8000, v17
+; GFX8-NEXT:    v_max_i16_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v14
+; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v13
 ; GFX8-NEXT:    v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_i16_e32 v7, v7, v15
+; GFX8-NEXT:    v_min_i16_e32 v7, v7, v17
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v16
+; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v14
 ; GFX8-NEXT:    v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT:    v_sub_u16_e32 v3, v3, v17
+; GFX8-NEXT:    v_sub_u16_e32 v3, v3, v15
 ; GFX8-NEXT:    v_sub_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
index 59d9ef9ec4257..5431466c8d8c0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
@@ -80,11 +80,10 @@ define i8 @v_usubo_i8(i8 %a, i8 %b) {
 ; GFX7-LABEL: v_usubo_i8:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -93,11 +92,10 @@ define i8 @v_usubo_i8(i8 %a, i8 %b) {
 ; GFX8-LABEL: v_usubo_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0xff
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
@@ -123,11 +121,10 @@ define i7 @v_usubo_i7(i7 %a, i7 %b) {
 ; GFX7-LABEL: v_usubo_i7:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0x7f
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -136,11 +133,10 @@ define i7 @v_usubo_i7(i7 %a, i7 %b) {
 ; GFX8-LABEL: v_usubo_i7:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0x7f
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
@@ -149,11 +145,10 @@ define i7 @v_usubo_i7(i7 %a, i7 %b) {
 ; GFX9-LABEL: v_usubo_i7:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7f
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_sub_u16_e32 v0, v0, v1
@@ -576,11 +571,10 @@ define i8 @s_usubo_i8(i8 %a, i8 %b) {
 ; GFX7-LABEL: s_usubo_i8:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -589,11 +583,10 @@ define i8 @s_usubo_i8(i8 %a, i8 %b) {
 ; GFX8-LABEL: s_usubo_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0xff
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
@@ -619,11 +612,10 @@ define i7 @s_usubo_i7(i7 %a, i7 %b) {
 ; GFX7-LABEL: s_usubo_i7:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0x7f
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -632,11 +624,10 @@ define i7 @s_usubo_i7(i7 %a, i7 %b) {
 ; GFX8-LABEL: s_usubo_i7:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0x7f
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
@@ -645,11 +636,10 @@ define i7 @s_usubo_i7(i7 %a, i7 %b) {
 ; GFX9-LABEL: s_usubo_i7:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7f
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_sub_u16_e32 v0, v0, v1
@@ -966,11 +956,10 @@ define amdgpu_ps i32 @usubo_i32_sv(i32 inreg %a, i32 %b) {
 define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) {
 ; GFX7-LABEL: usubo_i16_sv:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX7-NEXT:    v_and_b32_e32 v0, s1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s1, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -979,11 +968,10 @@ define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) {
 ;
 ; GFX8-LABEL: usubo_i16_sv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    v_and_b32_e32 v0, s1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s1, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
index fc67a417c709b..0cb346c35552c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
@@ -138,12 +138,11 @@ define <2 x i32> @v_trunc_v4i32_to_v4i16(<4 x i32> %src) {
 ; GFX7-LABEL: v_trunc_v4i32_to_v4i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 5134da030ec2e..16de70970ce7a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -452,7 +452,7 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -617,7 +617,7 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    s_mov_b32 s5, 24
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, s0, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
index 6c262ee828cdc..5721a4c9483d5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
@@ -308,9 +308,8 @@ define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; GISEL-LABEL: v_udiv_v2i32_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_movk_i32 s4, 0x1000
-; GISEL-NEXT:    v_lshl_b32_e32 v2, s4, v2
-; GISEL-NEXT:    v_lshl_b32_e32 v3, s4, v3
+; GISEL-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
+; GISEL-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
@@ -354,9 +353,8 @@ define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP-LABEL: v_udiv_v2i32_pow2_shl_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_movk_i32 s4, 0x1000
-; CGP-NEXT:    v_lshl_b32_e32 v2, s4, v2
-; CGP-NEXT:    v_lshl_b32_e32 v3, s4, v3
+; CGP-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
+; CGP-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; CGP-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3
@@ -413,9 +411,8 @@ define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
 ; GISEL-LABEL: v_udiv_i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
-; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
-; GISEL-NEXT:    v_and_b32_e32 v1, s4, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
@@ -440,9 +437,8 @@ define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
 ; CGP-LABEL: v_udiv_i32_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
-; CGP-NEXT:    v_and_b32_e32 v1, s4, v1
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; CGP-NEXT:    v_rcp_f32_e32 v2, v2
@@ -477,11 +473,10 @@ define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-LABEL: v_udiv_v2i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
-; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
-; GISEL-NEXT:    v_and_b32_e32 v1, s4, v1
-; GISEL-NEXT:    v_and_b32_e32 v2, s4, v2
-; GISEL-NEXT:    v_and_b32_e32 v3, s4, v3
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
@@ -525,11 +520,10 @@ define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-LABEL: v_udiv_v2i32_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
-; CGP-NEXT:    v_and_b32_e32 v1, s4, v1
-; CGP-NEXT:    v_and_b32_e32 v2, s4, v2
-; CGP-NEXT:    v_and_b32_e32 v3, s4, v3
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; CGP-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; CGP-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 3a7625ea9e362..ebb7381d0ae1a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -1838,9 +1838,8 @@ define i64 @v_udiv_i64_24bit(i64 %num, i64 %den) {
 ; GISEL-LABEL: v_udiv_i64_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
-; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
-; GISEL-NEXT:    v_and_b32_e32 v1, s4, v2
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
@@ -1866,9 +1865,8 @@ define i64 @v_udiv_i64_24bit(i64 %num, i64 %den) {
 ; CGP-LABEL: v_udiv_i64_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
-; CGP-NEXT:    v_and_b32_e32 v1, s4, v2
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; CGP-NEXT:    v_rcp_f32_e32 v2, v1
@@ -1892,19 +1890,18 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_udiv_v2i64_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s6, 0xffffff
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v7, 0
-; GISEL-NEXT:    v_and_b32_e32 v1, s6, v4
-; GISEL-NEXT:    v_and_b32_e32 v3, s6, v6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v6
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v1
 ; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, 0, v1
 ; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v3
 ; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
 ; GISEL-NEXT:    v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v7
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v6
+; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v7
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v8
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
 ; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
@@ -1916,17 +1913,17 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v11
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v6
+; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v8
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v11
-; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v15, v5, v6
-; GISEL-NEXT:    v_mul_hi_u32 v16, v4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v12
+; GISEL-NEXT:    v_mul_lo_u32 v15, v5, v12
+; GISEL-NEXT:    v_mul_hi_u32 v16, v4, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v18, v10, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v19, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v15, v6
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v17
 ; GISEL-NEXT:    v_mul_hi_u32 v18, v7, v17
@@ -1936,56 +1933,56 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v14
-; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v14
+; GISEL-NEXT:    v_mul_hi_u32 v18, v12, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v11, v17
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v16
-; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v12
-; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
-; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v16, s[4:5], v16, v18
-; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v12
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
+; GISEL-NEXT:    v_add_i32_e64 v16, s[4:5], v6, v16
+; GISEL-NEXT:    v_mul_lo_u32 v6, v12, v16
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v15, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v18
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v18
+; GISEL-NEXT:    v_mul_lo_u32 v6, v8, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
-; GISEL-NEXT:    v_mul_lo_u32 v19, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
-; GISEL-NEXT:    v_mul_hi_u32 v19, v7, v13
+; GISEL-NEXT:    v_mul_hi_u32 v18, v12, v16
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v18, s[4:5], v6, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v19, v6
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v17
+; GISEL-NEXT:    v_mul_hi_u32 v17, v7, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v6, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v20, vcc, v20, v6
+; GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v2
 ; GISEL-NEXT:    s_bfe_i32 s4, -1, 0x10000
 ; GISEL-NEXT:    s_bfe_i32 s5, -1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s6, -1, 0x10000
 ; GISEL-NEXT:    s_bfe_i32 s7, -1, 0x10000
-; GISEL-NEXT:    s_bfe_i32 s8, -1, 0x10000
-; GISEL-NEXT:    v_and_b32_e32 v0, s6, v0
-; GISEL-NEXT:    v_and_b32_e32 v2, s6, v2
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT:    v_mov_b32_e32 v16, s4
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v18, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT:    v_mov_b32_e32 v15, s4
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v20, v18
 ; GISEL-NEXT:    v_mov_b32_e32 v19, s5
-; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT:    v_mov_b32_e32 v15, s7
+; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v16
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT:    v_mov_b32_e32 v16, s6
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v11, v13
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v6
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v6
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v14, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v2
+; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v2
+; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v2
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
 ; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, v11, v13, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v7
@@ -1994,7 +1991,7 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v18, v8, v12
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_mul_hi_u32 v5, v6, v12
+; GISEL-NEXT:    v_mul_hi_u32 v5, v2, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v11
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -2007,12 +2004,12 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v4
+; GISEL-NEXT:    v_mul_lo_u32 v10, v2, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v4
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v18, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v5, v2, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v18, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v18, v11, v9
@@ -2029,7 +2026,7 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
-; GISEL-NEXT:    v_mov_b32_e32 v18, s8
+; GISEL-NEXT:    v_mov_b32_e32 v18, s7
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v8, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v9
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
@@ -2040,122 +2037,121 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, 0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, 0, v5
+; GISEL-NEXT:    v_mul_lo_u32 v5, 0, v2
+; GISEL-NEXT:    v_mul_hi_u32 v8, v6, v2
+; GISEL-NEXT:    v_mul_hi_u32 v2, 0, v2
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v11, v9, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v10, 0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v0, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v7, 0, v7
-; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v12, v6, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v13, 0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, 0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v17, v2, v9
+; GISEL-NEXT:    v_mul_lo_u32 v17, v0, v9
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v10, 0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v9
+; GISEL-NEXT:    v_mul_hi_u32 v11, v0, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, 0, v9
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v12
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v13, v5
+; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v13, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v14
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v12, v5
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_mul_lo_u32 v8, v1, v5
-; GISEL-NEXT:    v_mul_lo_u32 v12, 0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v13, v1, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, v1, v2
+; GISEL-NEXT:    v_mul_lo_u32 v12, 0, v2
+; GISEL-NEXT:    v_mul_hi_u32 v13, v1, v2
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v14, 0, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v3, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v10
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v6
+; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, 1, v5
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, 1, v2
 ; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
 ; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], 0, v9, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v1
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], 1, v7
-; GISEL-NEXT:    v_addc_u32_e64 v17, s[6:7], 0, v6, s[6:7]
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v11
+; GISEL-NEXT:    v_addc_u32_e64 v17, s[6:7], 0, v5, s[6:7]
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[6:7], v0, v11
 ; GISEL-NEXT:    v_subb_u32_e64 v11, s[8:9], 0, v10, s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, v16, v13, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v15, v13, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, v15, v16, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v16, v15, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], 1, v12
 ; GISEL-NEXT:    v_addc_u32_e64 v16, s[4:5], 0, v14, s[4:5]
 ; GISEL-NEXT:    v_sub_i32_e64 v10, s[4:5], 0, v10
 ; GISEL-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[6:7]
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v3
 ; GISEL-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v3
-; GISEL-NEXT:    v_add_i32_e64 v2, s[6:7], 1, v8
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
+; GISEL-NEXT:    v_add_i32_e64 v0, s[6:7], 1, v8
 ; GISEL-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, v17, s[6:7]
 ; GISEL-NEXT:    v_sub_i32_e64 v9, s[6:7], 0, v9
 ; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v6, v1
 ; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v19, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v19, v1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v12, v15, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, v8, v2, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v18, v6, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v12, v15, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v8, v0, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v8, v14, v16, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v17, v3, s[4:5]
 ; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, v7, v1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v7, v6, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i64_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s6, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v0, s6, v0
-; CGP-NEXT:    v_and_b32_e32 v1, s6, v2
-; CGP-NEXT:    v_and_b32_e32 v2, s6, v4
-; CGP-NEXT:    v_and_b32_e32 v3, s6, v6
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
+; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v4
+; CGP-NEXT:    v_and_b32_e32 v3, 0xffffff, v6
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v1
@@ -2176,8 +2172,8 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; CGP-NEXT:    v_and_b32_e32 v0, s6, v0
-; CGP-NEXT:    v_and_b32_e32 v2, s6, v1
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v1
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 0f1b97e9d3ed8..6c4b3ea921703 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -715,7 +715,6 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX8-LABEL: udivrem_v4i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x4f7ffffe
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s12
@@ -726,75 +725,75 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT:    v_mul_lo_u32 v3, s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v0
 ; GFX8-NEXT:    s_sub_i32 s0, 0, s13
-; GFX8-NEXT:    v_mul_lo_u32 v4, s0, v1
-; GFX8-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
+; GFX8-NEXT:    v_mul_lo_u32 v3, s0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX8-NEXT:    v_mul_lo_u32 v3, v0, s12
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s12
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v1, s13
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s8, v3
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s12, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s12, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s9, v5
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v5, v6
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
-; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s13, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s8, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s12, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s12, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v6
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s9, v5
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 1, v1
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_subrev_u32_e64 v5, s[0:1], s13, v2
 ; GFX8-NEXT:    s_sub_i32 s0, 0, s14
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v6, s15
-; GFX8-NEXT:    v_mul_lo_u32 v7, s0, v5
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GFX8-NEXT:    v_mul_hi_u32 v7, v5, v7
-; GFX8-NEXT:    v_mul_f32_e32 v2, v6, v2
-; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v7
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX8-NEXT:    v_mul_hi_u32 v7, s10, v5
-; GFX8-NEXT:    v_subrev_u32_e64 v5, s[0:1], s13, v3
+; GFX8-NEXT:    v_mul_lo_u32 v6, s0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 1, v1
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
+; GFX8-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s15
+; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v3, v6
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v6, v5
+; GFX8-NEXT:    v_mul_hi_u32 v3, s10, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v5, s[0:1], s13, v2
+; GFX8-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX8-NEXT:    s_sub_i32 s0, 0, s15
-; GFX8-NEXT:    v_mul_lo_u32 v6, s0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v5, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v3, v7, s14
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v7
-; GFX8-NEXT:    v_mul_hi_u32 v6, v2, v6
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s10, v3
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s14, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v2, v5, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v2, v3, s14
+; GFX8-NEXT:    v_mul_lo_u32 v7, s0, v6
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v3
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s10, v2
+; GFX8-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT:    v_mul_hi_u32 v8, s11, v2
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v7
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v7, v8, s15
-; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s14, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v6, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s11, v7
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 1, v8
+; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s14, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v2, v8, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v6, v7
+; GFX8-NEXT:    v_mul_hi_u32 v7, s11, v2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s14, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v3, v7, s15
+; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s14, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s11, v3
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v7
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s15, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s15, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v3, v8, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v7
@@ -814,7 +813,6 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX9-LABEL: udivrem_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4f7ffffe
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
@@ -822,85 +820,85 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX9-NEXT:    s_sub_i32 s1, 0, s13
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s14
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s14
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s1, v1
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s1, v1
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s14
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
+; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v2
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s12
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s13
-; GFX9-NEXT:    v_add_u32_e32 v7, 1, v1
-; GFX9-NEXT:    v_sub_u32_e32 v4, s8, v4
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s12, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v4
-; GFX9-NEXT:    v_sub_u32_e32 v5, s9, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s12, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s15
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v4
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s12
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s13
+; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
+; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v3
+; GFX9-NEXT:    v_sub_u32_e32 v7, s9, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s12, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s12, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v3, s4, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s15
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s13, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v5
+; GFX9-NEXT:    v_mul_hi_u32 v2, s10, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v8, s13, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; GFX9-NEXT:    v_mul_lo_u32 v6, s4, v3
+; GFX9-NEXT:    v_add_u32_e32 v7, 1, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v6
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s15
-; GFX9-NEXT:    v_add_u32_e32 v8, 1, v1
-; GFX9-NEXT:    v_mul_f32_e32 v2, v7, v2
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v6
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, s4, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
-; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v8, s13, v5
-; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v3, s14
-; GFX9-NEXT:    v_add_u32_e32 v8, 1, v3
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v7
-; GFX9-NEXT:    v_mul_hi_u32 v7, s11, v2
-; GFX9-NEXT:    v_sub_u32_e32 v6, s10, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v2, s14
+; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s13, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v6, s10, v7
+; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v8
+; GFX9-NEXT:    v_add_u32_e32 v8, 1, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s14, v6
-; GFX9-NEXT:    v_mul_lo_u32 v8, v7, s15
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc
-; GFX9-NEXT:    v_add_u32_e32 v2, 1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
+; GFX9-NEXT:    v_mul_hi_u32 v3, s11, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v7, s14, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
+; GFX9-NEXT:    v_mul_lo_u32 v8, v3, s15
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s14, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v3, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v3, s11, v8
-; GFX9-NEXT:    v_add_u32_e32 v8, 1, v7
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v7, s14, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v7, s11, v8
+; GFX9-NEXT:    v_add_u32_e32 v8, 1, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v8, s15, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
+; GFX9-NEXT:    v_add_u32_e32 v8, 1, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v8, s15, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v8, s15, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v3, v8, vcc
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v7
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s15, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -2003,16 +2001,15 @@ define amdgpu_kernel void @udivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_short v[0:1], v4
-; GFX8-NEXT:    v_and_b32_e32 v0, s0, v3
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
@@ -2047,7 +2044,6 @@ define amdgpu_kernel void @udivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s8, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s6
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s7
@@ -2070,7 +2066,7 @@ define amdgpu_kernel void @udivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -2078,7 +2074,7 @@ define amdgpu_kernel void @udivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
@@ -2269,7 +2265,6 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX8-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_and_b32 s2, s1, 0xffff
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
@@ -2277,7 +2272,7 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GFX8-NEXT:    s_sub_i32 s1, 0, s2
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_lshr_b32 s9, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -2292,7 +2287,7 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
-; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
+; GFX8-NEXT:    v_mul_hi_u32 v1, s8, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v2
@@ -2306,7 +2301,7 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s3
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s9, v3
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s8, v3
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
@@ -2316,11 +2311,11 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, s8, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v0, s8, v3
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
@@ -2592,7 +2587,6 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX8-NEXT:    v_mul_lo_u32 v1, s0, v0
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX8-NEXT:    s_and_b32 s4, s6, 0x7ffffff
-; GFX8-NEXT:    s_mov_b32 s5, 0x7ffffff
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v2, s4, v0
@@ -2610,11 +2604,11 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s7, v3
-; GFX8-NEXT:    v_and_b32_e32 v2, s5, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0x7ffffff, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_and_b32_e32 v2, s5, v3
+; GFX8-NEXT:    v_and_b32_e32 v2, 0x7ffffff, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -2633,7 +2627,6 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v0
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s4, 0x7ffffff
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
@@ -2649,10 +2642,10 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s6, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7ffffff, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7ffffff, v1
 ; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
index df31f96f177a1..004c6d7ab9ada 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -117,14 +117,13 @@ define <2 x i32> @v_urem_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
-; GISEL-NEXT:    s_mov_b32 s4, 0x4f7ffffe
 ; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
 ; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_f32_e32 v4, s4, v4
-; GISEL-NEXT:    v_mul_f32_e32 v6, s4, v6
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
@@ -217,9 +216,8 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CHECK-LABEL: v_urem_v2i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_movk_i32 s4, 0xfff
-; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
-; CHECK-NEXT:    v_and_b32_e32 v1, s4, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, 0xfff, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 0xfff, v1
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i32> %num, <i32 4096, i32 4096>
   ret <2 x i32> %result
@@ -257,14 +255,14 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
-; GISEL-NEXT:    v_mov_b32_e32 v3, 0xffed2705
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_lo_u32 v3, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v3, v4, v3
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffed2705
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GISEL-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v3
+; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v3
 ; GISEL-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GISEL-NEXT:    v_mul_lo_u32 v4, v4, s4
@@ -351,18 +349,16 @@ define <2 x i32> @v_urem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; GISEL-LABEL: v_urem_v2i32_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_movk_i32 s4, 0x1000
-; GISEL-NEXT:    s_mov_b32 s5, 0x4f7ffffe
-; GISEL-NEXT:    v_lshl_b32_e32 v2, s4, v2
-; GISEL-NEXT:    v_lshl_b32_e32 v3, s4, v3
+; GISEL-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
+; GISEL-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
 ; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_f32_e32 v4, s5, v4
-; GISEL-NEXT:    v_mul_f32_e32 v6, s5, v6
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
@@ -394,9 +390,8 @@ define <2 x i32> @v_urem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP-LABEL: v_urem_v2i32_pow2_shl_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_movk_i32 s4, 0x1000
-; CGP-NEXT:    v_lshl_b32_e32 v2, s4, v2
-; CGP-NEXT:    v_lshl_b32_e32 v3, s4, v3
+; CGP-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
+; CGP-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; CGP-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3
@@ -449,9 +444,8 @@ define i32 @v_urem_i32_24bit(i32 %num, i32 %den) {
 ; GISEL-LABEL: v_urem_i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
-; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
-; GISEL-NEXT:    v_and_b32_e32 v1, s4, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
@@ -474,9 +468,8 @@ define i32 @v_urem_i32_24bit(i32 %num, i32 %den) {
 ; CGP-LABEL: v_urem_i32_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
-; CGP-NEXT:    v_and_b32_e32 v1, s4, v1
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; CGP-NEXT:    v_rcp_f32_e32 v2, v2
@@ -509,20 +502,18 @@ define <2 x i32> @v_urem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-LABEL: v_urem_v2i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
-; GISEL-NEXT:    s_mov_b32 s5, 0x4f7ffffe
-; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
-; GISEL-NEXT:    v_and_b32_e32 v1, s4, v1
-; GISEL-NEXT:    v_and_b32_e32 v2, s4, v2
-; GISEL-NEXT:    v_and_b32_e32 v3, s4, v3
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
 ; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_f32_e32 v4, s5, v4
-; GISEL-NEXT:    v_mul_f32_e32 v6, s5, v6
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
@@ -554,11 +545,10 @@ define <2 x i32> @v_urem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-LABEL: v_urem_v2i32_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
-; CGP-NEXT:    v_and_b32_e32 v1, s4, v1
-; CGP-NEXT:    v_and_b32_e32 v2, s4, v2
-; CGP-NEXT:    v_and_b32_e32 v3, s4, v3
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; CGP-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; CGP-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index a08c9277393ea..e6b5660d000f5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -951,9 +951,8 @@ define <2 x i64> @v_urem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CHECK-LABEL: v_urem_v2i64_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_movk_i32 s4, 0xfff
-; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
-; CHECK-NEXT:    v_and_b32_e32 v2, s4, v2
+; CHECK-NEXT:    v_and_b32_e32 v0, 0xfff, v0
+; CHECK-NEXT:    v_and_b32_e32 v2, 0xfff, v2
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -967,34 +966,34 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
-; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
+; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s5, 0xffed2705
 ; CHECK-NEXT:    s_bfe_i32 s6, -1, 0x10000
 ; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; CHECK-NEXT:    v_mov_b32_e32 v5, s6
-; CHECK-NEXT:    v_mov_b32_e32 v6, s7
-; CHECK-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v3
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v4
+; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
+; CHECK-NEXT:    v_mov_b32_e32 v4, s6
+; CHECK-NEXT:    v_mov_b32_e32 v5, s7
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
-; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CHECK-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v3
+; CHECK-NEXT:    v_trunc_f32_e32 v6, v6
+; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v6
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v4
+; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v6
 ; CHECK-NEXT:    v_mul_lo_u32 v8, s5, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v9, -1, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v10, s5, v3
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v8
+; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v11, v3, v8
-; CHECK-NEXT:    v_mul_hi_u32 v8, v4, v8
+; CHECK-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v7
-; CHECK-NEXT:    v_mul_lo_u32 v12, v4, v7
+; CHECK-NEXT:    v_mul_lo_u32 v12, v6, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v3, v7
-; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
+; CHECK-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
@@ -1010,20 +1009,20 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v7, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v8, -1, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v9, s5, v3
-; CHECK-NEXT:    v_mul_lo_u32 v10, s5, v4
-; CHECK-NEXT:    v_mul_lo_u32 v11, v4, v7
+; CHECK-NEXT:    v_mul_lo_u32 v10, s5, v6
+; CHECK-NEXT:    v_mul_lo_u32 v11, v6, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v3, v7
-; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
+; CHECK-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v3, v8
-; CHECK-NEXT:    v_mul_lo_u32 v10, v4, v8
+; CHECK-NEXT:    v_mul_lo_u32 v10, v6, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v3, v8
-; CHECK-NEXT:    v_mul_hi_u32 v8, v4, v8
+; CHECK-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
@@ -1039,14 +1038,14 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v8, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, v0, v4
-; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v4
-; CHECK-NEXT:    v_mul_hi_u32 v11, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
+; CHECK-NEXT:    v_mul_lo_u32 v9, v0, v6
+; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v6
+; CHECK-NEXT:    v_mul_hi_u32 v11, v0, v6
+; CHECK-NEXT:    v_mul_hi_u32 v6, v1, v6
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v10, v3
@@ -1063,32 +1062,32 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v8, s4, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v9, 0, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v3, s4, v3
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v4, s4, v4
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v6
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT:    v_subb_u32_e64 v4, s[4:5], v1, v3, vcc
+; CHECK-NEXT:    v_subb_u32_e64 v6, s[4:5], v1, v3, vcc
 ; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v3
 ; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[4:5]
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v0, v2
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v4, v2
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[4:5]
 ; CHECK-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %num, 1235195
   ret i64 %result
@@ -1100,62 +1099,62 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s8, 0x12d8fb
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v7, 0
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, 0x12d8fb
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, s8
+; GISEL-NEXT:    v_madmk_f32 v6, v5, 0x4f800000, v7
 ; GISEL-NEXT:    s_subb_u32 s7, 0, 0
 ; GISEL-NEXT:    s_bfe_i32 s4, -1, 0x10000
 ; GISEL-NEXT:    s_bfe_i32 s5, -1, 0x10000
-; GISEL-NEXT:    v_madmk_f32 v9, v7, 0x4f800000, v8
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v5
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v6
 ; GISEL-NEXT:    v_mov_b32_e32 v6, s4
 ; GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v7
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; GISEL-NEXT:    s_sub_u32 s9, 0, 0x12d8fb
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v9
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v8
+; GISEL-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
 ; GISEL-NEXT:    s_subb_u32 s10, 0, 0
 ; GISEL-NEXT:    s_bfe_i32 s11, -1, 0x10000
 ; GISEL-NEXT:    s_bfe_i32 s12, -1, 0x10000
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
-; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v7
-; GISEL-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v8
+; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v8
+; GISEL-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v7
 ; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
 ; GISEL-NEXT:    v_trunc_f32_e32 v10, v10
-; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v9
+; GISEL-NEXT:    v_mac_f32_e32 v8, 0xcf800000, v9
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0xcf800000, v10
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v10
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v10
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_mul_lo_u32 v11, s6, v9
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT:    v_mul_lo_u32 v11, s6, v9
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v12, s9, v10
-; GISEL-NEXT:    v_mul_lo_u32 v13, s6, v7
-; GISEL-NEXT:    v_mul_lo_u32 v14, s7, v7
-; GISEL-NEXT:    v_mul_hi_u32 v15, s6, v7
-; GISEL-NEXT:    v_mul_lo_u32 v16, s9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v17, s10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v18, s9, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, s6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v14, s7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v15, s6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v16, s9, v7
+; GISEL-NEXT:    v_mul_lo_u32 v17, s10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v18, s9, v7
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v13
-; GISEL-NEXT:    v_mul_hi_u32 v19, v7, v13
+; GISEL-NEXT:    v_mul_hi_u32 v19, v8, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v17, v10, v16
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
-; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v16
+; GISEL-NEXT:    v_mul_hi_u32 v15, v7, v16
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v10, v16
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
-; GISEL-NEXT:    v_mul_lo_u32 v18, v8, v12
+; GISEL-NEXT:    v_mul_lo_u32 v18, v7, v12
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
-; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v11
+; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v11
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
-; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v11
+; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v9, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v19
@@ -1167,7 +1166,7 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v17, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
-; GISEL-NEXT:    v_mul_hi_u32 v18, v8, v12
+; GISEL-NEXT:    v_mul_hi_u32 v18, v7, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v19, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
@@ -1182,38 +1181,38 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v18, v17
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, s6, v7
-; GISEL-NEXT:    v_mul_lo_u32 v13, s7, v7
-; GISEL-NEXT:    v_mul_hi_u32 v14, s6, v7
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v16
+; GISEL-NEXT:    v_mul_lo_u32 v11, s6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, s7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, s6, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v16
 ; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v10, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, s9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v15, s10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v16, s9, v8
+; GISEL-NEXT:    v_mul_lo_u32 v12, s9, v7
+; GISEL-NEXT:    v_mul_lo_u32 v15, s10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v16, s9, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v17, s6, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v18, v9, v11
-; GISEL-NEXT:    v_mul_hi_u32 v19, v7, v11
+; GISEL-NEXT:    v_mul_hi_u32 v19, v8, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v9, v11
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
 ; GISEL-NEXT:    v_mul_lo_u32 v17, s9, v10
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
 ; GISEL-NEXT:    v_mul_lo_u32 v17, v10, v12
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v12
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v15
+; GISEL-NEXT:    v_mul_lo_u32 v16, v7, v15
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
-; GISEL-NEXT:    v_mul_lo_u32 v14, v7, v13
+; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v13
 ; GISEL-NEXT:    v_mul_lo_u32 v16, v9, v13
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v18, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
-; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v18, s[4:5], v18, v19
 ; GISEL-NEXT:    v_mul_lo_u32 v19, v10, v15
@@ -1224,7 +1223,7 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; GISEL-NEXT:    v_mul_hi_u32 v17, v8, v15
+; GISEL-NEXT:    v_mul_hi_u32 v17, v7, v15
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v19, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
@@ -1242,16 +1241,16 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v16
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v7
-; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v8
+; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v1, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
 ; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v10, v14, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v3, v8
-; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v3, v8
+; GISEL-NEXT:    v_mul_lo_u32 v12, v3, v7
+; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v3, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v0, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v16, v1, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v0, v9
@@ -1266,55 +1265,55 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v3, v10
-; GISEL-NEXT:    v_add_i32_e64 v7, s[6:7], v16, v7
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v16, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v13, v8
+; GISEL-NEXT:    v_add_i32_e64 v7, s[6:7], v13, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[6:7]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT:    v_mul_lo_u32 v14, s8, v7
-; GISEL-NEXT:    v_mul_lo_u32 v15, 0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, s8, v7
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_mul_lo_u32 v12, s8, v8
-; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v8
+; GISEL-NEXT:    v_mul_lo_u32 v14, s8, v8
+; GISEL-NEXT:    v_mul_lo_u32 v15, 0, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v8, s8, v8
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_mul_lo_u32 v12, s8, v7
+; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, s8, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v10
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v16, v10
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v14
-; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v1, v7, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v7
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v1, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v8
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v12
-; GISEL-NEXT:    v_subb_u32_e64 v10, s[6:7], v3, v8, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v8
+; GISEL-NEXT:    v_subb_u32_e64 v10, s[6:7], v3, v7, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v7
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v2, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[6:7]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s[6:7]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v19, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
 ; GISEL-NEXT:    v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5]
 ; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v0, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
@@ -1351,62 +1350,62 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_mov_b32 s8, 0x12d8fb
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
+; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
 ; CGP-NEXT:    s_mov_b32 s6, 0xffed2705
 ; CGP-NEXT:    s_bfe_i32 s4, -1, 0x10000
 ; CGP-NEXT:    s_bfe_i32 s5, -1, 0x10000
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v7, 0x12d8fb
+; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v8, 0
 ; CGP-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CGP-NEXT:    s_bfe_i32 s9, -1, 0x10000
-; CGP-NEXT:    v_cvt_f32_u32_e32 v7, s8
-; CGP-NEXT:    v_mov_b32_e32 v8, s4
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; CGP-NEXT:    v_mov_b32_e32 v6, s4
 ; CGP-NEXT:    v_mov_b32_e32 v9, s5
-; CGP-NEXT:    v_cvt_f32_u32_e32 v10, v4
-; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v5
-; CGP-NEXT:    v_mac_f32_e32 v10, 0x4f800000, v6
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v7
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v6, v10
+; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v8
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; CGP-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
-; CGP-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v6
-; CGP-NEXT:    v_trunc_f32_e32 v7, v7
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
+; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v5
+; CGP-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v7
+; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_trunc_f32_e32 v10, v10
-; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v10
+; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v8
+; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v10
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, s6, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v8
+; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_mul_lo_u32 v12, s6, v10
 ; CGP-NEXT:    v_mul_lo_u32 v13, s6, v5
 ; CGP-NEXT:    v_mul_lo_u32 v14, -1, v5
 ; CGP-NEXT:    v_mul_hi_u32 v15, s6, v5
-; CGP-NEXT:    v_mul_lo_u32 v16, s6, v6
-; CGP-NEXT:    v_mul_lo_u32 v17, -1, v6
-; CGP-NEXT:    v_mul_hi_u32 v18, s6, v6
+; CGP-NEXT:    v_mul_lo_u32 v16, s6, v7
+; CGP-NEXT:    v_mul_lo_u32 v17, -1, v7
+; CGP-NEXT:    v_mul_hi_u32 v18, s6, v7
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v13
+; CGP-NEXT:    v_mul_lo_u32 v14, v8, v13
 ; CGP-NEXT:    v_mul_hi_u32 v19, v5, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
 ; CGP-NEXT:    v_mul_lo_u32 v17, v10, v16
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
-; CGP-NEXT:    v_mul_hi_u32 v15, v6, v16
+; CGP-NEXT:    v_mul_hi_u32 v15, v7, v16
 ; CGP-NEXT:    v_mul_hi_u32 v16, v10, v16
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
-; CGP-NEXT:    v_mul_lo_u32 v18, v6, v12
+; CGP-NEXT:    v_mul_lo_u32 v18, v7, v12
 ; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
 ; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
 ; CGP-NEXT:    v_mul_lo_u32 v15, v5, v11
-; CGP-NEXT:    v_mul_lo_u32 v17, v7, v11
+; CGP-NEXT:    v_mul_lo_u32 v17, v8, v11
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
 ; CGP-NEXT:    v_mul_hi_u32 v14, v5, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v7, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v19
 ; CGP-NEXT:    v_mul_lo_u32 v19, v10, v12
@@ -1417,7 +1416,7 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v17, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT:    v_mul_hi_u32 v18, v6, v12
+; CGP-NEXT:    v_mul_hi_u32 v18, v7, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v19, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
@@ -1433,33 +1432,33 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v11, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, s6, v5
 ; CGP-NEXT:    v_mul_lo_u32 v13, -1, v5
 ; CGP-NEXT:    v_mul_hi_u32 v14, s6, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v16
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v16
 ; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v12, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v6
-; CGP-NEXT:    v_mul_lo_u32 v15, -1, v6
-; CGP-NEXT:    v_mul_hi_u32 v16, s6, v6
-; CGP-NEXT:    v_mul_lo_u32 v17, s6, v7
-; CGP-NEXT:    v_mul_lo_u32 v18, v7, v11
+; CGP-NEXT:    v_mul_lo_u32 v12, s6, v7
+; CGP-NEXT:    v_mul_lo_u32 v15, -1, v7
+; CGP-NEXT:    v_mul_hi_u32 v16, s6, v7
+; CGP-NEXT:    v_mul_lo_u32 v17, s6, v8
+; CGP-NEXT:    v_mul_lo_u32 v18, v8, v11
 ; CGP-NEXT:    v_mul_hi_u32 v19, v5, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v7, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
 ; CGP-NEXT:    v_mul_lo_u32 v17, s6, v10
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
 ; CGP-NEXT:    v_mul_lo_u32 v17, v10, v12
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v6, v12
+; CGP-NEXT:    v_mul_hi_u32 v14, v7, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; CGP-NEXT:    v_mul_lo_u32 v16, v6, v15
+; CGP-NEXT:    v_mul_lo_u32 v16, v7, v15
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; CGP-NEXT:    v_mul_lo_u32 v14, v5, v13
-; CGP-NEXT:    v_mul_lo_u32 v16, v7, v13
+; CGP-NEXT:    v_mul_lo_u32 v16, v8, v13
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v18, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
@@ -1474,7 +1473,7 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT:    v_mul_hi_u32 v17, v6, v15
+; CGP-NEXT:    v_mul_hi_u32 v17, v7, v15
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v19, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
@@ -1485,7 +1484,7 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v18
 ; CGP-NEXT:    v_mov_b32_e32 v18, s9
-; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
 ; CGP-NEXT:    v_mul_hi_u32 v15, v10, v15
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
@@ -1493,19 +1492,19 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v16
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v13, vcc
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, v1, v5
 ; CGP-NEXT:    v_mul_hi_u32 v13, v0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
 ; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v14, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, v3, v6
-; CGP-NEXT:    v_mul_hi_u32 v14, v2, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v3, v6
-; CGP-NEXT:    v_mul_lo_u32 v15, v0, v7
-; CGP-NEXT:    v_mul_lo_u32 v16, v1, v7
-; CGP-NEXT:    v_mul_hi_u32 v17, v0, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v7
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v7
+; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
+; CGP-NEXT:    v_mul_lo_u32 v15, v0, v8
+; CGP-NEXT:    v_mul_lo_u32 v16, v1, v8
+; CGP-NEXT:    v_mul_hi_u32 v17, v0, v8
+; CGP-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
@@ -1518,57 +1517,57 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_hi_u32 v10, v3, v10
 ; CGP-NEXT:    v_add_i32_e64 v5, s[6:7], v16, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v6, s[6:7], v13, v6
+; CGP-NEXT:    v_add_i32_e64 v7, s[6:7], v13, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[6:7]
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; CGP-NEXT:    v_mul_lo_u32 v14, s8, v5
 ; CGP-NEXT:    v_mul_lo_u32 v15, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, s8, v5
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT:    v_mul_lo_u32 v12, s8, v6
-; CGP-NEXT:    v_mul_lo_u32 v16, 0, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, s8, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
+; CGP-NEXT:    v_mul_lo_u32 v12, s8, v7
+; CGP-NEXT:    v_mul_lo_u32 v16, 0, v7
+; CGP-NEXT:    v_mul_hi_u32 v7, s8, v7
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_mul_lo_u32 v7, s8, v7
+; CGP-NEXT:    v_mul_lo_u32 v8, s8, v8
 ; CGP-NEXT:    v_mul_lo_u32 v10, s8, v10
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v15, v7
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v15, v8
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v16, v10
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v14
-; CGP-NEXT:    v_subb_u32_e64 v7, s[4:5], v1, v5, vcc
+; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v1, v5, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v5
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v12
-; CGP-NEXT:    v_subb_u32_e64 v10, s[6:7], v3, v6, s[4:5]
-; CGP-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v6
+; CGP-NEXT:    v_subb_u32_e64 v10, s[6:7], v3, v7, s[4:5]
+; CGP-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v7
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v2, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[6:7]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[6:7]
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[6:7]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[6:7]
 ; CGP-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v19, v6, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v6, v19, v7, vcc
 ; CGP-NEXT:    v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v0, v4
+; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v0, v4
 ; CGP-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v4
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v12, vcc, v2, v4
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
@@ -1576,23 +1575,23 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; CGP-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v8, v4
+; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v7, v4
 ; CGP-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; CGP-NEXT:    v_cndmask_b32_e32 v13, v18, v13, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v12, v4
 ; CGP-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v3, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; CGP-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[4:5]
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[4:5]
 ; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v3, v10, v3, s[4:5]
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i64> %num, <i64 1235195, i64 1235195>
@@ -2367,9 +2366,8 @@ define i64 @v_urem_i64_24bit(i64 %num, i64 %den) {
 ; GISEL-LABEL: v_urem_i64_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
-; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
-; GISEL-NEXT:    v_and_b32_e32 v1, s4, v2
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
@@ -2393,9 +2391,8 @@ define i64 @v_urem_i64_24bit(i64 %num, i64 %den) {
 ; CGP-LABEL: v_urem_i64_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
-; CGP-NEXT:    v_and_b32_e32 v1, s4, v2
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v0
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; CGP-NEXT:    v_rcp_f32_e32 v4, v3
@@ -2421,19 +2418,18 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_urem_v2i64_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s6, 0xffffff
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v7, 0
-; GISEL-NEXT:    v_and_b32_e32 v3, s6, v4
-; GISEL-NEXT:    v_and_b32_e32 v1, s6, v6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v4
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v6
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v3
 ; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
 ; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v1
 ; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, 0, v1
 ; GISEL-NEXT:    v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v7
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v6
+; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v7
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v8
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
 ; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
@@ -2445,17 +2441,17 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v11
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v6
+; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v8
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v11
-; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v15, v5, v6
-; GISEL-NEXT:    v_mul_hi_u32 v16, v4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v12
+; GISEL-NEXT:    v_mul_lo_u32 v15, v5, v12
+; GISEL-NEXT:    v_mul_hi_u32 v16, v4, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v18, v10, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v19, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v15, v6
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v17
 ; GISEL-NEXT:    v_mul_hi_u32 v18, v7, v17
@@ -2465,56 +2461,56 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v14
-; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v14
+; GISEL-NEXT:    v_mul_hi_u32 v18, v12, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v11, v17
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v16
-; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v12
-; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
-; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v16, s[4:5], v16, v18
-; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v12
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
+; GISEL-NEXT:    v_add_i32_e64 v16, s[4:5], v6, v16
+; GISEL-NEXT:    v_mul_lo_u32 v6, v12, v16
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v15, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v18
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v18
+; GISEL-NEXT:    v_mul_lo_u32 v6, v8, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
-; GISEL-NEXT:    v_mul_lo_u32 v19, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
-; GISEL-NEXT:    v_mul_hi_u32 v19, v7, v13
+; GISEL-NEXT:    v_mul_hi_u32 v18, v12, v16
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v18, s[4:5], v6, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v19, v6
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v17
+; GISEL-NEXT:    v_mul_hi_u32 v17, v7, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v6, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v20, vcc, v20, v6
+; GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v2
 ; GISEL-NEXT:    s_bfe_i32 s4, -1, 0x10000
 ; GISEL-NEXT:    s_bfe_i32 s5, -1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s6, -1, 0x10000
 ; GISEL-NEXT:    s_bfe_i32 s7, -1, 0x10000
-; GISEL-NEXT:    s_bfe_i32 s8, -1, 0x10000
-; GISEL-NEXT:    v_and_b32_e32 v0, s6, v0
-; GISEL-NEXT:    v_and_b32_e32 v2, s6, v2
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT:    v_mov_b32_e32 v16, s4
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v18, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT:    v_mov_b32_e32 v15, s4
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v20, v18
 ; GISEL-NEXT:    v_mov_b32_e32 v19, s5
-; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT:    v_mov_b32_e32 v15, s7
+; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v16
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT:    v_mov_b32_e32 v16, s6
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v11, v13
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v6
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v6
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v14, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v2
+; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v2
+; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v2
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
 ; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, v11, v13, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v7
@@ -2523,7 +2519,7 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v18, v8, v12
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_mul_hi_u32 v5, v6, v12
+; GISEL-NEXT:    v_mul_hi_u32 v5, v2, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v11
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -2536,12 +2532,12 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v4
+; GISEL-NEXT:    v_mul_lo_u32 v10, v2, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v4
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v18, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v5, v2, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v18, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v18, v11, v9
@@ -2558,7 +2554,7 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
-; GISEL-NEXT:    v_mov_b32_e32 v18, s8
+; GISEL-NEXT:    v_mov_b32_e32 v18, s7
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v8, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v9
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
@@ -2569,118 +2565,117 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, 0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, 0, v5
+; GISEL-NEXT:    v_mul_lo_u32 v5, 0, v2
+; GISEL-NEXT:    v_mul_hi_u32 v8, v6, v2
+; GISEL-NEXT:    v_mul_hi_u32 v2, 0, v2
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v11, v9, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v10, 0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v0, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v7, 0, v7
-; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v12, v6, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v13, 0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, 0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v17, v2, v9
+; GISEL-NEXT:    v_mul_lo_u32 v17, v0, v9
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v10, 0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v9
+; GISEL-NEXT:    v_mul_hi_u32 v11, v0, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, 0, v9
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v12
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v13, v5
+; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v13, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v14
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v12, v5
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v5
-; GISEL-NEXT:    v_mul_lo_u32 v12, 0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v2
+; GISEL-NEXT:    v_mul_lo_u32 v12, 0, v2
+; GISEL-NEXT:    v_mul_hi_u32 v2, v3, v2
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v13, 0, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v10
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v4, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v6, v1, v6
+; GISEL-NEXT:    v_mul_lo_u32 v5, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], 0, v4, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v4, s[4:5], 0, v4
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v7
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v6, v8
+; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], 0, v2, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], 0, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v11
-; GISEL-NEXT:    v_subb_u32_e64 v8, s[6:7], 0, v5, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[6:7], 0, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v2, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
+; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], v0, v11
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[6:7], 0, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v8, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[6:7]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, v16, v7, s[6:7]
-; GISEL-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v15, v9, vcc
-; GISEL-NEXT:    v_subbrev_u32_e64 v5, vcc, 0, v5, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v0, v3
-; GISEL-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[6:7]
+; GISEL-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GISEL-NEXT:    v_subbrev_u32_e64 v0, vcc, 0, v0, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v5, v3
+; GISEL-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, v2, v1
-; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, v8, v1
+; GISEL-NEXT:    v_subbrev_u32_e32 v13, vcc, 0, v0, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v12, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v11, v19, v11, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v10, v3
-; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v13, v18, v13, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v2, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v13
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v18, v0, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v12, v1
-; GISEL-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v13, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v13
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, v12, v1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v2, v14, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v5, v15, s[4:5]
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v13, v15, s[4:5]
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v8, v1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i64_24bit:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s6, 0xffffff
-; CGP-NEXT:    v_and_b32_e32 v0, s6, v0
-; CGP-NEXT:    v_and_b32_e32 v1, s6, v2
-; CGP-NEXT:    v_and_b32_e32 v2, s6, v4
-; CGP-NEXT:    v_and_b32_e32 v3, s6, v6
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
+; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v4
+; CGP-NEXT:    v_and_b32_e32 v3, 0xffffff, v6
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v0
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v1
@@ -2705,8 +2700,8 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v3, v5, v3
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; CGP-NEXT:    v_and_b32_e32 v0, s6, v0
-; CGP-NEXT:    v_and_b32_e32 v2, s6, v1
+; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v1
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index fe46feea4ebbf..3de02ca279fa7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -440,7 +440,7 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -601,7 +601,7 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    s_mov_b32 s5, 24
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, s0, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index f88ce8b36825c..32fdf242a28e1 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -205,8 +205,7 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
 ; GFX9PLUS: global_load_dword [[B:v[0-9]+]]
 
 ; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
-; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]]
-; GFX10-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
+; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
 ; GFX9PLUS-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9PLUS: buffer_store_dwordx4
 

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 6b6734efae924..089ee88e78fca 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -1177,7 +1177,6 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-LABEL: udiv_v4i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b32 s3, 0x4f7ffffe
 ; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s15, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s14, -1
@@ -1189,9 +1188,9 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s11
-; GFX6-NEXT:    v_mul_f32_e32 v0, s3, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s3, v1
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX6-NEXT:    s_sub_i32 s2, 0, s9
@@ -1216,7 +1215,7 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s5, v5
-; GFX6-NEXT:    v_mul_f32_e32 v2, s3, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
@@ -1232,7 +1231,7 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX6-NEXT:    v_mul_f32_e32 v4, s3, v4
+; GFX6-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s10
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
@@ -1264,9 +1263,8 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-LABEL: udiv_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s12, 0x4f7ffffe
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
@@ -1275,75 +1273,75 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
-; GFX9-NEXT:    v_mul_f32_e32 v0, s12, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s11
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, s12, v1
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX9-NEXT:    s_sub_i32 s2, 0, s10
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX9-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_f32_e32 v3, s12, v5
-; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s11
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v5
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s8
 ; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v5
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s8, v5
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, s9
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v7, s8, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v2
+; GFX9-NEXT:    s_sub_i32 s2, 0, s11
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s9
 ; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX9-NEXT:    v_add_u32_e32 v8, 1, v1
+; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, s2, v3
-; GFX9-NEXT:    v_sub_u32_e32 v6, s5, v6
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v6
-; GFX9-NEXT:    v_mul_f32_e32 v2, s12, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v7
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    s_sub_i32 s2, 0, s11
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s9, v6
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v6
+; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v3, v6, v3
+; GFX9-NEXT:    v_mul_lo_u32 v8, v2, s10
+; GFX9-NEXT:    v_subrev_u32_e32 v7, s9, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX9-NEXT:    v_add_u32_e32 v3, v6, v3
+; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
 ; GFX9-NEXT:    v_add_u32_e32 v7, 1, v1
-; GFX9-NEXT:    v_mul_hi_u32 v5, v2, v5
-; GFX9-NEXT:    v_mul_lo_u32 v8, v3, s10
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v6
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
+; GFX9-NEXT:    v_sub_u32_e32 v5, s6, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v5, s7, v2
-; GFX9-NEXT:    v_sub_u32_e32 v6, s6, v8
-; GFX9-NEXT:    v_add_u32_e32 v7, 1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s10, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v5, s11
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v5
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s10, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v3, s11
 ; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v6
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v5
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v5
+; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
+; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s11, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s11, v3
+; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v5
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <4 x i32> %x, %y
@@ -1479,26 +1477,25 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-LABEL: urem_v4i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b32 s13, 0x4f7ffffe
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    s_sub_i32 s2, 0, s8
-; GFX6-NEXT:    s_sub_i32 s12, 0, s9
+; GFX6-NEXT:    s_sub_i32 s12, 0, s8
+; GFX6-NEXT:    s_sub_i32 s13, 0, s9
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s11
-; GFX6-NEXT:    v_mul_f32_e32 v0, s13, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s13, v1
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v0
+; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
@@ -1506,7 +1503,7 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
-; GFX6-NEXT:    v_mul_f32_e32 v2, s13, v3
+; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
@@ -1526,7 +1523,7 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
 ; GFX6-NEXT:    s_sub_i32 s4, 0, s11
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, s13, v4
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
@@ -1558,53 +1555,52 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-LABEL: urem_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s12, 0x4f7ffffe
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX9-NEXT:    s_sub_i32 s2, 0, s8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s10
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    s_sub_i32 s3, 0, s9
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GFX9-NEXT:    v_mul_f32_e32 v0, s12, v0
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, s12, v1
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s11
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GFX9-NEXT:    s_sub_i32 s2, 0, s10
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_f32_e32 v2, s12, v5
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v6
-; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v1
+; GFX9-NEXT:    s_sub_i32 s2, 0, s10
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
+; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s11
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v2
 ; GFX9-NEXT:    s_sub_i32 s2, 0, s11
-; GFX9-NEXT:    v_mul_f32_e32 v3, s12, v3
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v2, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s8
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v3
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
+; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v0
+; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s10
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s10
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
@@ -1809,7 +1805,6 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-LABEL: sdiv_v4i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b32 s16, 0x4f7ffffe
 ; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s15, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s14, -1
@@ -1825,13 +1820,13 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX6-NEXT:    s_sub_i32 s1, 0, s3
 ; GFX6-NEXT:    s_ashr_i32 s0, s4, 31
-; GFX6-NEXT:    v_mul_f32_e32 v0, s16, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX6-NEXT:    s_xor_b32 s2, s0, s2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s1, v0
 ; GFX6-NEXT:    s_add_i32 s1, s4, s0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s16, v1
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
@@ -1865,7 +1860,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s9
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v3, s16, v3
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
@@ -1893,7 +1888,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s4
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
-; GFX6-NEXT:    v_mul_f32_e32 v4, s16, v4
+; GFX6-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s4, v3
@@ -1932,9 +1927,8 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-LABEL: sdiv_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s15, 0x4f7ffffe
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
 ; GFX9-NEXT:    s_add_i32 s3, s8, s2
@@ -1947,13 +1941,13 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX9-NEXT:    s_sub_i32 s14, 0, s3
 ; GFX9-NEXT:    s_ashr_i32 s8, s4, 31
-; GFX9-NEXT:    v_mul_f32_e32 v0, s15, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    s_add_i32 s4, s4, s8
 ; GFX9-NEXT:    s_xor_b32 s4, s4, s8
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, s15, v1
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    s_sub_i32 s14, 0, s9
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
@@ -1988,7 +1982,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v3, s15, v3
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v5, s9, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
@@ -2008,7 +2002,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    s_xor_b32 s6, s6, s5
 ; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX9-NEXT:    v_mul_f32_e32 v3, s15, v3
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
 ; GFX9-NEXT:    s_xor_b32 s2, s13, s12
@@ -2215,7 +2209,6 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-LABEL: srem_v4i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b32 s14, 0x4f7ffffe
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2223,29 +2216,30 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    s_add_i32 s8, s8, s2
 ; GFX6-NEXT:    s_xor_b32 s8, s8, s2
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
-; GFX6-NEXT:    s_add_i32 s9, s9, s12
-; GFX6-NEXT:    s_xor_b32 s9, s9, s12
+; GFX6-NEXT:    s_ashr_i32 s13, s9, 31
+; GFX6-NEXT:    s_add_i32 s9, s9, s13
+; GFX6-NEXT:    s_xor_b32 s9, s9, s13
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_sub_i32 s13, 0, s8
+; GFX6-NEXT:    s_sub_i32 s14, 0, s8
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX6-NEXT:    s_ashr_i32 s12, s4, 31
-; GFX6-NEXT:    v_mul_f32_e32 v0, s14, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX6-NEXT:    s_add_i32 s4, s4, s12
 ; GFX6-NEXT:    s_xor_b32 s4, s4, s12
-; GFX6-NEXT:    v_mul_lo_u32 v2, s13, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s14, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, s14, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    s_sub_i32 s13, 0, s9
+; GFX6-NEXT:    s_sub_i32 s14, 0, s9
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_mul_lo_u32 v2, s13, v1
 ; GFX6-NEXT:    s_ashr_i32 s13, s5, 31
 ; GFX6-NEXT:    s_add_i32 s5, s5, s13
+; GFX6-NEXT:    s_xor_b32 s5, s5, s13
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX6-NEXT:    v_mul_lo_u32 v2, s14, v1
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
@@ -2253,26 +2247,25 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
-; GFX6-NEXT:    s_xor_b32 s4, s5, s13
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    s_ashr_i32 s5, s10, 31
+; GFX6-NEXT:    s_ashr_i32 s4, s10, 31
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX6-NEXT:    s_add_i32 s8, s10, s5
-; GFX6-NEXT:    s_xor_b32 s5, s8, s5
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
-; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v1
+; GFX6-NEXT:    s_add_i32 s8, s10, s4
+; GFX6-NEXT:    s_xor_b32 s4, s8, s4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
-; GFX6-NEXT:    v_mul_f32_e32 v2, s14, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
-; GFX6-NEXT:    s_sub_i32 s4, 0, s5
+; GFX6-NEXT:    s_sub_i32 s5, 0, s4
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v2
+; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
@@ -2280,22 +2273,22 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v4
 ; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
 ; GFX6-NEXT:    s_add_i32 s9, s11, s8
-; GFX6-NEXT:    s_ashr_i32 s4, s6, 31
+; GFX6-NEXT:    s_ashr_i32 s5, s6, 31
 ; GFX6-NEXT:    s_xor_b32 s8, s9, s8
-; GFX6-NEXT:    s_add_i32 s6, s6, s4
+; GFX6-NEXT:    s_add_i32 s6, s6, s5
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
-; GFX6-NEXT:    s_xor_b32 s6, s6, s4
+; GFX6-NEXT:    s_xor_b32 s6, s6, s5
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s13, v1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s13, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s5
-; GFX6-NEXT:    v_mul_f32_e32 v3, s14, v3
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s4
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v2
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
 ; GFX6-NEXT:    s_sub_i32 s6, 0, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v3
@@ -2303,14 +2296,14 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    s_add_i32 s7, s7, s6
 ; GFX6-NEXT:    s_xor_b32 s7, s7, s6
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, s5, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s8
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s5, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
@@ -2326,37 +2319,37 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-LABEL: srem_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s13, 0x4f7ffffe
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
 ; GFX9-NEXT:    s_add_i32 s8, s8, s2
 ; GFX9-NEXT:    s_xor_b32 s2, s8, s2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX9-NEXT:    s_ashr_i32 s3, s9, 31
-; GFX9-NEXT:    s_sub_i32 s12, 0, s2
 ; GFX9-NEXT:    s_add_i32 s8, s9, s3
+; GFX9-NEXT:    s_sub_i32 s12, 0, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_xor_b32 s3, s8, s3
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GFX9-NEXT:    s_ashr_i32 s8, s4, 31
-; GFX9-NEXT:    v_mul_f32_e32 v0, s13, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    s_add_i32 s4, s4, s8
 ; GFX9-NEXT:    s_xor_b32 s4, s4, s8
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, s13, v1
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    s_sub_i32 s12, 0, s3
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX9-NEXT:    s_add_i32 s5, s5, s9
+; GFX9-NEXT:    s_ashr_i32 s12, s10, 31
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX9-NEXT:    s_add_i32 s5, s5, s9
 ; GFX9-NEXT:    s_xor_b32 s5, s5, s9
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
@@ -2367,9 +2360,8 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GFX9-NEXT:    s_ashr_i32 s2, s10, 31
-; GFX9-NEXT:    s_add_i32 s4, s10, s2
-; GFX9-NEXT:    s_xor_b32 s2, s4, s2
+; GFX9-NEXT:    s_add_i32 s2, s10, s12
+; GFX9-NEXT:    s_xor_b32 s2, s2, s12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
@@ -2379,7 +2371,7 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX9-NEXT:    v_mul_f32_e32 v2, s13, v2
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
@@ -2398,7 +2390,7 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    s_xor_b32 s5, s5, s4
 ; GFX9-NEXT:    v_mul_hi_u32 v2, s5, v2
-; GFX9-NEXT:    v_mul_f32_e32 v3, s13, v5
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX9-NEXT:    s_sub_i32 s6, 0, s3
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s2
@@ -2529,32 +2521,32 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s8, s6, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX6-NEXT:    s_and_b32 s9, s4, 0xffff
+; GFX6-NEXT:    s_and_b32 s9, s6, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GFX6-NEXT:    s_and_b32 s8, s4, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
+; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
-; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
+; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
 ; GFX6-NEXT:    s_and_b32 s4, s5, 0xffff
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
@@ -2566,18 +2558,17 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX6-NEXT:    s_mov_b32 s8, 0xffff
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v6, v7
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
 ; GFX6-NEXT:    v_mad_f32 v3, -v3, v5, v6
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX6-NEXT:    v_and_b32_e32 v1, s8, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2633,9 +2624,8 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v7
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, v4, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
 ; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
@@ -2748,65 +2738,64 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    s_and_b32 s8, s6, 0xffff
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s6
-; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    v_alignbit_b32 v4, s7, v4, 16
-; GFX6-NEXT:    s_and_b32 s9, s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v5, s8, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX6-NEXT:    s_and_b32 s8, s4, 0xffff
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, v5
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
-; GFX6-NEXT:    v_alignbit_b32 v3, s5, v3, 16
-; GFX6-NEXT:    v_and_b32_e32 v6, s8, v3
-; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 16
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, v6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
+; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX6-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX6-NEXT:    v_mul_f32_e32 v2, v6, v7
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, v6, v7
-; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v1
-; GFX6-NEXT:    v_mad_f32 v1, -v1, v5, v6
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX6-NEXT:    v_mad_f32 v2, -v2, v5, v6
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v5
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v5
 ; GFX6-NEXT:    s_and_b32 s6, s7, 0xffff
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
 ; GFX6-NEXT:    s_and_b32 s6, s5, 0xffff
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v4
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    s_lshr_b32 s4, s7, 16
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v1
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v1, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
 ; GFX6-NEXT:    s_lshr_b32 s6, s5, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s6
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX6-NEXT:    v_mad_f32 v4, -v1, v2, v4
+; GFX6-NEXT:    v_mad_f32 v4, -v1, v3, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v2
-; GFX6-NEXT:    v_mul_f32_e32 v2, v6, v7
-; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
+; GFX6-NEXT:    v_mul_f32_e32 v3, v6, v7
+; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_mad_f32 v2, -v2, v5, v6
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v5
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_mad_f32 v3, -v3, v5, v6
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s4
-; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v1, s8, v1
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -2868,10 +2857,9 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s6
 ; GFX9-NEXT:    v_sub_u32_e32 v4, s4, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s8, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v3
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v1, v3, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
 ; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
@@ -3053,12 +3041,11 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -3132,11 +3119,10 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX9-NEXT:    v_add_u32_e32 v0, s0, v6
-; GFX9-NEXT:    v_and_b32_e32 v1, v5, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, v5, v3
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
@@ -3338,12 +3324,11 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    s_lshr_b32 s4, s5, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -3425,11 +3410,10 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s7
 ; GFX9-NEXT:    v_sub_u32_e32 v5, s9, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s6, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, v4, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
@@ -3996,31 +3980,30 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s9, s6, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX6-NEXT:    s_and_b32 s8, s6, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
 ; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
 ; GFX6-NEXT:    v_alignbit_b32 v2, s7, v2, 16
-; GFX6-NEXT:    v_and_b32_e32 v5, s8, v2
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v2
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v4
-; GFX6-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 16
-; GFX6-NEXT:    v_and_b32_e32 v3, s8, v1
+; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_alignbit_b32 v0, s5, v0, 16
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s6
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
 ; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
@@ -4041,11 +4024,11 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s7
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -4607,30 +4590,29 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s8, s2, 0x7fff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
-; GFX6-NEXT:    s_and_b32 s9, s0, 0x7fff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
+; GFX6-NEXT:    s_and_b32 s8, s0, 0x7fff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
+; GFX6-NEXT:    s_and_b32 s3, s2, 0x7fff
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0xf000f
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s0
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0xf000f
-; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
 ; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v6, v7
-; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_mad_f32 v4, -v1, v5, v6
@@ -4644,9 +4626,9 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v1
 ; GFX6-NEXT:    v_mad_f32 v0, -v1, v2, v0
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s3, v3
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_and_b32_e32 v3, s3, v4
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
@@ -4661,18 +4643,17 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX9-NEXT:    s_and_b32 s3, s2, 0x7fff
-; GFX9-NEXT:    s_and_b32 s7, s0, 0x7fff
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s6, s2, 0x7fff
+; GFX9-NEXT:    s_and_b32 s3, s0, 0x7fff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0xf000f
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
 ; GFX9-NEXT:    s_bfe_u32 s2, s2, 0xf000f
@@ -4680,14 +4661,14 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
-; GFX9-NEXT:    v_and_b32_e32 v3, s6, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v7, v8
-; GFX9-NEXT:    v_and_b32_e32 v0, s6, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_mad_f32 v5, -v1, v6, v7
@@ -4701,9 +4682,9 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v1
 ; GFX9-NEXT:    v_mad_f32 v0, -v1, v3, v0
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, s6, v4
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_and_b32_e32 v4, s6, v5
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
@@ -4796,55 +4777,54 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s9, s2, 0x7fff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s9
-; GFX6-NEXT:    s_and_b32 s10, s0, 0x7fff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
+; GFX6-NEXT:    s_and_b32 s8, s2, 0x7fff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
+; GFX6-NEXT:    s_and_b32 s9, s0, 0x7fff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
 ; GFX6-NEXT:    s_bfe_u32 s1, s0, 0xf000f
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX6-NEXT:    s_bfe_u32 s10, s2, 0xf000f
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    s_bfe_u32 s9, s2, 0xf000f
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
-; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
-; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
-; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
+; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
+; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s2, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v4
 ; GFX6-NEXT:    v_mad_f32 v3, -v1, v5, v3
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 15
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mad_f32 v3, -v3, v4, v7
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 15
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
-; GFX6-NEXT:    s_lshr_b32 s8, s2, 15
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v1
+; GFX6-NEXT:    s_lshr_b32 s3, s2, 15
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s3, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v3, s3, v3
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
-; GFX6-NEXT:    v_and_b32_e32 v2, s3, v6
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -4858,33 +4838,32 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s6, s2, 0x7fff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX9-NEXT:    s_and_b32 s3, s2, 0x7fff
-; GFX9-NEXT:    s_and_b32 s8, s0, 0x7fff
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s3
-; GFX9-NEXT:    s_bfe_u32 s3, s0, 0xf000f
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
+; GFX9-NEXT:    s_and_b32 s7, s0, 0x7fff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX9-NEXT:    s_bfe_u32 s6, s0, 0xf000f
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
 ; GFX9-NEXT:    v_alignbit_b32 v3, s1, v3, 30
-; GFX9-NEXT:    s_bfe_u32 s7, s2, 0xf000f
+; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
+; GFX9-NEXT:    s_bfe_u32 s3, s2, 0xf000f
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    v_and_b32_e32 v3, s6, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
-; GFX9-NEXT:    v_and_b32_e32 v0, s6, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v7, v8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v5
@@ -4907,9 +4886,9 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_sub_u32_e32 v4, s0, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v5, s2, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_and_b32_e32 v4, s6, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v3, s6, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
@@ -5057,11 +5036,10 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX6-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v3, s0, v3
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
-; GFX6-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -5128,10 +5106,9 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_mad_f32 v1, -v1, v3, v6
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
 ; GFX9-NEXT:    v_add_u32_e32 v0, v7, v0
-; GFX9-NEXT:    v_and_b32_e32 v3, s0, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, s0, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
@@ -5236,18 +5213,18 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_bfe_i32 s10, s2, 0xf0000
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s10
+; GFX6-NEXT:    s_bfe_i32 s9, s2, 0xf0000
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s9
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
 ; GFX6-NEXT:    s_bfe_i32 s1, s0, 0xf0000
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s1
-; GFX6-NEXT:    s_xor_b32 s1, s10, s1
+; GFX6-NEXT:    s_xor_b32 s1, s9, s1
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 30
 ; GFX6-NEXT:    s_or_b32 s1, s1, 1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v7, s1
-; GFX6-NEXT:    s_lshr_b32 s9, s0, 15
+; GFX6-NEXT:    s_lshr_b32 s8, s0, 15
 ; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf000f
 ; GFX6-NEXT:    v_mul_f32_e32 v6, v5, v6
 ; GFX6-NEXT:    v_trunc_f32_e32 v6, v6
@@ -5255,29 +5232,28 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v6, v6
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v4, s0
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0xf000f
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s0
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, s1
-; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, s3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s2, v4
-; GFX6-NEXT:    v_mul_f32_e32 v7, v6, v7
-; GFX6-NEXT:    v_trunc_f32_e32 v7, v7
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
 ; GFX6-NEXT:    s_xor_b32 s0, s1, s0
-; GFX6-NEXT:    v_mad_f32 v6, -v7, v5, v6
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 15
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX6-NEXT:    v_mul_f32_e32 v7, v6, v7
+; GFX6-NEXT:    v_trunc_f32_e32 v7, v7
+; GFX6-NEXT:    v_mad_f32 v6, -v7, v5, v6
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v7, v7
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v5|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v2
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    s_or_b32 s0, s0, 1
+; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
 ; GFX6-NEXT:    v_mov_b32_e32 v8, s0
-; GFX6-NEXT:    v_and_b32_e32 v1, s3, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v8, vcc
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
@@ -5292,15 +5268,15 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, v5, s9
+; GFX6-NEXT:    v_mul_lo_u32 v5, v5, s8
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, v3
-; GFX6-NEXT:    s_lshr_b32 s8, s2, 15
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s8, v5
+; GFX6-NEXT:    s_lshr_b32 s3, s2, 15
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v5
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
-; GFX6-NEXT:    v_and_b32_e32 v3, s3, v4
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 15, v2
 ; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -5314,46 +5290,45 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_i32 s6, s2, 0xf0000
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s6
+; GFX9-NEXT:    s_bfe_i32 s1, s2, 0xf0000
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s1
+; GFX9-NEXT:    s_bfe_i32 s0, s6, 0xf0000
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s0
+; GFX9-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_alignbit_b32 v1, s1, v1, 30
-; GFX9-NEXT:    s_bfe_i32 s1, s0, 0xf0000
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s1
-; GFX9-NEXT:    s_xor_b32 s1, s6, s1
-; GFX9-NEXT:    s_ashr_i32 s1, s1, 30
-; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX9-NEXT:    s_lshr_b32 s3, s2, 15
-; GFX9-NEXT:    s_lshr_b32 s9, s0, 15
-; GFX9-NEXT:    s_or_b32 s1, s1, 1
+; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX9-NEXT:    s_lshr_b32 s8, s2, 15
+; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
 ; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v5|, |v4|
-; GFX9-NEXT:    s_and_b64 s[6:7], s[6:7], exec
-; GFX9-NEXT:    s_cselect_b32 s1, s1, 0
-; GFX9-NEXT:    v_add_u32_e32 v4, s1, v6
-; GFX9-NEXT:    s_bfe_i32 s1, s0, 0xf000f
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s1
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s0
-; GFX9-NEXT:    s_bfe_i32 s0, s2, 0xf000f
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, s0
+; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
+; GFX9-NEXT:    s_lshr_b32 s3, s6, 15
+; GFX9-NEXT:    s_or_b32 s7, s0, 1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cselect_b32 s0, s7, 0
+; GFX9-NEXT:    v_add_u32_e32 v4, s0, v6
+; GFX9-NEXT:    s_bfe_i32 s0, s6, 0xf000f
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
+; GFX9-NEXT:    s_bfe_i32 s1, s2, 0xf000f
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, s1
+; GFX9-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    v_and_b32_e32 v3, s8, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v1
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
 ; GFX9-NEXT:    v_mul_f32_e32 v7, v6, v7
 ; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
 ; GFX9-NEXT:    v_mad_f32 v6, -v7, v5, v6
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v7
-; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
+; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s6
 ; GFX9-NEXT:    s_or_b32 s6, s0, 1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v6|, |v5|
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, v1
@@ -5372,16 +5347,16 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_mad_f32 v7, -v7, v6, v8
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, v5, s9
+; GFX9-NEXT:    v_mul_lo_u32 v5, v5, s3
 ; GFX9-NEXT:    v_add_u32_e32 v1, v9, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v4
-; GFX9-NEXT:    v_sub_u32_e32 v4, s3, v5
+; GFX9-NEXT:    v_sub_u32_e32 v4, s8, v5
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v4, s8, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v3, s8, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
@@ -5671,7 +5646,6 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s0, 0x4f7ffffe
 ; GFX6-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5679,12 +5653,12 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s7
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX6-NEXT:    s_sub_i32 s0, 0, s2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s0, v1
-; GFX6-NEXT:    s_sub_i32 s0, 0, s2
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v0
 ; GFX6-NEXT:    s_sub_i32 s0, 0, s3
@@ -5721,20 +5695,19 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s6
 ; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s7
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX9-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX9-NEXT:    s_sub_i32 s3, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5759,12 +5732,12 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, s7, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v6, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
@@ -6003,7 +5976,6 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
-; GFX6-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6011,13 +5983,13 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX6-NEXT:    s_lshl_b32 s7, 0x1000, s7
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX6-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX6-NEXT:    s_sub_i32 s2, 0, s7
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
@@ -6055,16 +6027,15 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s7
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT:    s_mov_b32 s6, 0x4f7ffffe
+; GFX9-NEXT:    s_sub_i32 s6, 0, s3
 ; GFX9-NEXT:    s_sub_i32 s7, 0, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    v_mul_f32_e32 v0, s6, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, s6, v1
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s6, 0, s3
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
@@ -6471,7 +6442,6 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s13, 0x4f7ffffe
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6481,55 +6451,55 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    s_xor_b32 s2, s0, s1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s11
-; GFX6-NEXT:    s_sub_i32 s11, 0, s2
-; GFX6-NEXT:    s_ashr_i32 s10, s0, 31
+; GFX6-NEXT:    s_ashr_i32 s3, s0, 31
+; GFX6-NEXT:    s_add_i32 s0, s0, s3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_add_i32 s0, s0, s10
-; GFX6-NEXT:    s_ashr_i32 s3, s8, 31
-; GFX6-NEXT:    s_add_i32 s8, s8, s3
-; GFX6-NEXT:    v_mul_f32_e32 v0, s13, v0
+; GFX6-NEXT:    s_sub_i32 s11, 0, s2
+; GFX6-NEXT:    s_xor_b32 s10, s0, s3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_xor_b32 s12, s3, s1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v0
-; GFX6-NEXT:    s_xor_b32 s11, s0, s10
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s11
-; GFX6-NEXT:    s_xor_b32 s0, s8, s3
-; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    s_sub_i32 s3, 0, s11
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s13, v2
+; GFX6-NEXT:    s_ashr_i32 s0, s8, 31
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX6-NEXT:    s_add_i32 s8, s8, s0
+; GFX6-NEXT:    v_mul_lo_u32 v2, s11, v0
+; GFX6-NEXT:    s_xor_b32 s8, s8, s0
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s0, v2
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX6-NEXT:    v_mul_lo_u32 v3, s3, v1
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT:    s_xor_b32 s11, s0, s1
+; GFX6-NEXT:    s_sub_i32 s0, 0, s10
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
+; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v3
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s2, v3
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
 ; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX6-NEXT:    s_add_i32 s1, s9, s0
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
-; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s10
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX6-NEXT:    s_xor_b32 s2, s0, s10
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s11
+; GFX6-NEXT:    s_xor_b32 s2, s0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v2
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
+; GFX6-NEXT:    v_xor_b32_e32 v0, s11, v0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s11, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s11, v0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
@@ -6540,7 +6510,6 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s11, 0x4f7ffffe
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s6
@@ -6549,47 +6518,44 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    s_xor_b32 s0, s0, s1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
 ; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s7
-; GFX9-NEXT:    s_ashr_i32 s9, s6, 31
-; GFX9-NEXT:    s_add_i32 s6, s6, s9
+; GFX9-NEXT:    s_ashr_i32 s8, s6, 31
+; GFX9-NEXT:    s_add_i32 s6, s6, s8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s6, s6, s9
+; GFX9-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
 ; GFX9-NEXT:    s_sub_i32 s10, 0, s0
-; GFX9-NEXT:    v_mul_f32_e32 v0, s11, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    s_ashr_i32 s7, s4, 31
 ; GFX9-NEXT:    s_add_i32 s4, s4, s7
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, s11, v1
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_xor_b32 s4, s4, s7
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
 ; GFX9-NEXT:    s_sub_i32 s10, 0, s6
-; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
-; GFX9-NEXT:    s_add_i32 s5, s5, s8
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
+; GFX9-NEXT:    s_xor_b32 s4, s4, s7
+; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v1
+; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
-; GFX9-NEXT:    s_xor_b32 s5, s5, s8
-; GFX9-NEXT:    s_xor_b32 s1, s7, s1
+; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v4
+; GFX9-NEXT:    s_add_i32 s5, s5, s9
+; GFX9-NEXT:    s_xor_b32 s5, s5, s9
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s0
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v4, s4, v4
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
+; GFX9-NEXT:    v_sub_u32_e32 v4, s4, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s6
+; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    s_xor_b32 s0, s8, s9
-; GFX9-NEXT:    v_xor_b32_e32 v0, s1, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v3
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
@@ -6597,7 +6563,10 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; GFX9-NEXT:    s_xor_b32 s1, s7, s1
+; GFX9-NEXT:    s_xor_b32 s0, s9, s8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v0, s1, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s1, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s0, v1
@@ -6924,45 +6893,44 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s9, 0x4f7ffffe
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s6
 ; GFX6-NEXT:    s_ashr_i32 s3, s2, 31
 ; GFX6-NEXT:    s_add_i32 s2, s2, s3
 ; GFX6-NEXT:    s_xor_b32 s6, s2, s3
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s7
-; GFX6-NEXT:    s_ashr_i32 s7, s2, 31
-; GFX6-NEXT:    s_add_i32 s2, s2, s7
+; GFX6-NEXT:    s_lshl_b32 s7, 0x1000, s7
+; GFX6-NEXT:    s_ashr_i32 s8, s7, 31
+; GFX6-NEXT:    s_add_i32 s7, s7, s8
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_xor_b32 s7, s2, s7
+; GFX6-NEXT:    s_xor_b32 s7, s7, s8
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX6-NEXT:    s_sub_i32 s2, 0, s6
-; GFX6-NEXT:    v_mul_f32_e32 v0, s9, v0
+; GFX6-NEXT:    s_sub_i32 s9, 0, s6
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX6-NEXT:    s_ashr_i32 s8, s4, 31
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GFX6-NEXT:    s_add_i32 s2, s4, s8
-; GFX6-NEXT:    v_mul_f32_e32 v1, s9, v1
-; GFX6-NEXT:    s_xor_b32 s4, s2, s8
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT:    s_add_i32 s4, s4, s8
+; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX6-NEXT:    s_xor_b32 s4, s4, s8
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    s_sub_i32 s2, 0, s7
-; GFX6-NEXT:    s_ashr_i32 s9, s5, 31
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT:    s_sub_i32 s9, 0, s7
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v1
+; GFX6-NEXT:    s_ashr_i32 s9, s5, 31
+; GFX6-NEXT:    s_add_i32 s5, s5, s9
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GFX6-NEXT:    s_add_i32 s4, s5, s9
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    s_xor_b32 s4, s4, s9
+; GFX6-NEXT:    s_xor_b32 s4, s5, s9
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
@@ -6986,8 +6954,6 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b32 s9, 0x4f7ffffe
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s6
 ; GFX9-NEXT:    s_ashr_i32 s6, s3, 31
@@ -7003,10 +6969,10 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    s_sub_i32 s8, 0, s3
 ; GFX9-NEXT:    s_ashr_i32 s6, s4, 31
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_f32_e32 v0, s9, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_add_i32 s4, s4, s6
-; GFX9-NEXT:    v_mul_f32_e32 v1, s9, v1
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v0
 ; GFX9-NEXT:    s_sub_i32 s8, 0, s2
@@ -7022,6 +6988,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s2
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
@@ -7042,6 +7009,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s7, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s7, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
@@ -9047,9 +9015,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
 ; GFX6-NEXT:    s_mov_b64 s[12:13], 0x1000
-; GFX6-NEXT:    s_mov_b32 s18, 0x4f800000
-; GFX6-NEXT:    s_mov_b32 s19, 0x5f7ffffc
-; GFX6-NEXT:    s_mov_b32 s20, 0x2f800000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_lshl_b64 s[8:9], s[12:13], s8
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[12:13], s10
@@ -9060,41 +9025,40 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    s_xor_b64 s[12:13], s[8:9], s[14:15]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GFX6-NEXT:    s_mov_b32 s21, 0xcf800000
 ; GFX6-NEXT:    s_sub_u32 s10, 0, s12
 ; GFX6-NEXT:    s_subb_u32 s11, 0, s13
-; GFX6-NEXT:    v_mac_f32_e32 v0, s18, v1
-; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_ashr_i32 s16, s5, 31
+; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
 ; GFX6-NEXT:    s_add_u32 s0, s4, s16
-; GFX6-NEXT:    v_mul_f32_e32 v0, s19, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
+; GFX6-NEXT:    s_mov_b32 s17, s16
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
+; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s17, s16
 ; GFX6-NEXT:    s_addc_u32 s1, s5, s16
+; GFX6-NEXT:    s_xor_b64 s[4:5], s[0:1], s[16:17]
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
-; GFX6-NEXT:    v_mul_lo_u32 v5, s10, v0
-; GFX6-NEXT:    s_xor_b64 s[4:5], s[0:1], s[16:17]
+; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
+; GFX6-NEXT:    v_mul_lo_u32 v4, s10, v0
+; GFX6-NEXT:    s_xor_b64 s[14:15], s[16:17], s[14:15]
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
-; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX6-NEXT:    s_xor_b64 s[14:15], s[16:17], s[14:15]
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
@@ -9176,16 +9140,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
-; GFX6-NEXT:    v_mac_f32_e32 v8, s18, v9
+; GFX6-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GFX6-NEXT:    v_rcp_f32_e32 v3, v8
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
-; GFX6-NEXT:    v_mul_f32_e32 v3, s19, v3
-; GFX6-NEXT:    v_mul_f32_e32 v4, s20, v3
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; GFX6-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX6-NEXT:    v_mac_f32_e32 v3, s21, v4
+; GFX6-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    s_sub_u32 s0, 0, s2
@@ -9309,9 +9273,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
-; GFX9-NEXT:    s_mov_b32 s16, 0x4f800000
-; GFX9-NEXT:    s_mov_b32 s17, 0x5f7ffffc
-; GFX9-NEXT:    s_mov_b32 s18, 0x2f800000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
@@ -9322,17 +9283,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_xor_b64 s[8:9], s[2:3], s[12:13]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX9-NEXT:    s_mov_b32 s19, 0xcf800000
 ; GFX9-NEXT:    s_sub_u32 s2, 0, s8
 ; GFX9-NEXT:    s_subb_u32 s3, 0, s9
-; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
-; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
+; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    s_mov_b32 s15, s14
-; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
+; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
@@ -9343,16 +9303,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
@@ -9437,16 +9397,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
-; GFX9-NEXT:    v_mac_f32_e32 v9, s16, v10
+; GFX9-NEXT:    v_mac_f32_e32 v9, 0x4f800000, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
 ; GFX9-NEXT:    v_rcp_f32_e32 v4, v9
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
-; GFX9-NEXT:    v_mul_f32_e32 v4, s17, v4
-; GFX9-NEXT:    v_mul_f32_e32 v5, s18, v4
+; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_mac_f32_e32 v4, s19, v5
+; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX9-NEXT:    s_sub_u32 s0, 0, s10
@@ -10220,10 +10180,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
 ; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
-; GFX6-NEXT:    s_mov_b32 s18, 0x4f800000
-; GFX6-NEXT:    s_mov_b32 s19, 0x5f7ffffc
-; GFX6-NEXT:    s_mov_b32 s20, 0x2f800000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX6-NEXT:    s_lshl_b64 s[14:15], s[2:3], s10
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
 ; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
@@ -10233,42 +10191,40 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    s_xor_b64 s[16:17], s[2:3], s[8:9]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s17
-; GFX6-NEXT:    s_mov_b32 s21, 0xcf800000
 ; GFX6-NEXT:    s_sub_u32 s2, 0, s16
 ; GFX6-NEXT:    s_subb_u32 s3, 0, s17
-; GFX6-NEXT:    v_mac_f32_e32 v0, s18, v1
-; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_ashr_i32 s12, s5, 31
+; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
 ; GFX6-NEXT:    s_add_u32 s0, s4, s12
-; GFX6-NEXT:    v_mul_f32_e32 v0, s19, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
+; GFX6-NEXT:    s_mov_b32 s13, s12
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
+; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s13, s12
 ; GFX6-NEXT:    s_addc_u32 s1, s5, s12
+; GFX6-NEXT:    s_xor_b64 s[4:5], s[0:1], s[12:13]
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v0
-; GFX6-NEXT:    s_xor_b64 s[4:5], s[0:1], s[12:13]
+; GFX6-NEXT:    v_mul_lo_u32 v5, s3, v0
+; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v0
+; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
-; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX6-NEXT:    s_mov_b32 s11, 0xf000
-; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
@@ -10343,7 +10299,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s5
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
-; GFX6-NEXT:    v_mac_f32_e32 v6, s18, v7
+; GFX6-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v6, v6
@@ -10353,10 +10309,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
-; GFX6-NEXT:    v_mul_f32_e32 v3, s19, v6
-; GFX6-NEXT:    v_mul_f32_e32 v4, s20, v3
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v6
+; GFX6-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX6-NEXT:    v_mac_f32_e32 v3, s21, v4
+; GFX6-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    s_sub_u32 s0, 0, s4
@@ -10478,9 +10434,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
-; GFX9-NEXT:    s_mov_b32 s16, 0x4f800000
-; GFX9-NEXT:    s_mov_b32 s17, 0x5f7ffffc
-; GFX9-NEXT:    s_mov_b32 s18, 0x2f800000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
@@ -10491,17 +10444,16 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_xor_b64 s[12:13], s[2:3], s[8:9]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GFX9-NEXT:    s_mov_b32 s19, 0xcf800000
 ; GFX9-NEXT:    s_sub_u32 s2, 0, s12
 ; GFX9-NEXT:    s_subb_u32 s3, 0, s13
-; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
-; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
+; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
+; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
@@ -10512,16 +10464,16 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
@@ -10600,7 +10552,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s15
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v6, v2, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
-; GFX9-NEXT:    v_mac_f32_e32 v7, s16, v8
+; GFX9-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v7, v7
@@ -10610,10 +10562,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v4, s[0:1]
-; GFX9-NEXT:    v_mul_f32_e32 v4, s17, v7
-; GFX9-NEXT:    v_mul_f32_e32 v5, s18, v4
+; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v7
+; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_mac_f32_e32 v4, s19, v5
+; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX9-NEXT:    s_sub_u32 s0, 0, s10

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
index c2d520e4fe14f..e5de201bedf8e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
@@ -33,9 +33,8 @@ define i32 @f(i32 %x, i32 %y) {
 ; GCN-LABEL: f:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, 0xffff80
-; GCN-NEXT:    v_or_b32_e32 v0, s4, v0
-; GCN-NEXT:    v_or_b32_e32 v1, s4, v1
+; GCN-NEXT:    v_or_b32_e32 v0, 0xffff80, v0
+; GCN-NEXT:    v_or_b32_e32 v1, 0xffff80, v1
 ; GCN-NEXT:    v_mul_i32_i24_e32 v0, v0, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 14, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index e82f2c6ea9171..78aa2cc01c6d5 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -275,12 +275,10 @@ define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrsp
 ; FUNC-LABEL: {{^}}v_and_multi_use_constant_i64:
 ; SI-DAG: buffer_load_dwordx2 v[[[LO0:[0-9]+]]:[[HI0:[0-9]+]]]
 ; SI-DAG: buffer_load_dwordx2 v[[[LO1:[0-9]+]]:[[HI1:[0-9]+]]]
-; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}}
-; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}}
-; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO0]]
-; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI0]]
-; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO1]]
-; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]]
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, v[[LO0]]
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, v[[HI0]]
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, v[[LO1]]
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, v[[HI1]]
 ; SI: buffer_store_dwordx2
 ; SI: buffer_store_dwordx2
 define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {

diff  --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index 9e7bf1ed0f7f8..784336a05da1c 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -518,7 +518,6 @@ define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
 ; SI-NEXT:    v_alignbit_b32 v4, v1, v1, 8
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
 ; SI-NEXT:    s_mov_b32 s4, 0xff00ff
-; SI-NEXT:    s_mov_b32 s5, 0xffff0000
 ; SI-NEXT:    v_alignbit_b32 v5, v0, v0, 8
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; SI-NEXT:    v_alignbit_b32 v6, v3, v3, 8
@@ -529,9 +528,9 @@ define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
 ; SI-NEXT:    v_bfi_b32 v0, s4, v0, v5
 ; SI-NEXT:    v_bfi_b32 v3, s4, v3, v6
 ; SI-NEXT:    v_bfi_b32 v2, s4, v2, v7
-; SI-NEXT:    v_and_b32_e32 v4, s5, v1
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_and_b32_e32 v3, s5, v3
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index 2660c3162b107..fd2c131aaa717 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -250,25 +250,23 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s12, 0xff00
-; SI-NEXT:    s_movk_i32 s13, 0xff
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    s_mov_b32 s8, s6
 ; SI-NEXT:    s_mov_b32 s9, s7
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
-; SI-NEXT:    v_and_b32_e32 v2, s12, v0
-; SI-NEXT:    v_and_b32_e32 v4, s12, v1
-; SI-NEXT:    v_and_b32_e32 v3, s13, v3
+; SI-NEXT:    v_and_b32_e32 v2, 0xff00, v0
+; SI-NEXT:    v_and_b32_e32 v4, 0xff00, v1
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_and_b32_e32 v1, s13, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v4, v1
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -283,18 +281,15 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_movk_i32 s12, 0xff00
-; VI-NEXT:    s_movk_i32 s13, 0xff
-; VI-NEXT:    s_movk_i32 s14, 0x900
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
 ; VI-NEXT:    s_mov_b32 s11, s7
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s0
@@ -303,16 +298,16 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0
 ; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT:    v_and_b32_e32 v4, s12, v1
+; VI-NEXT:    v_and_b32_e32 v4, 0xffffff00, v1
 ; VI-NEXT:    v_add_u16_e32 v1, 9, v1
 ; VI-NEXT:    v_add_u16_e32 v3, 9, v0
-; VI-NEXT:    v_and_b32_e32 v1, s13, v1
-; VI-NEXT:    v_and_b32_e32 v2, s12, v0
-; VI-NEXT:    v_and_b32_e32 v3, s13, v3
+; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; VI-NEXT:    v_and_b32_e32 v2, 0xffffff00, v0
+; VI-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; VI-NEXT:    v_or_b32_e32 v1, v4, v1
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    v_add_u16_e32 v1, s14, v1
-; VI-NEXT:    v_add_u16_e32 v2, s14, v2
+; VI-NEXT:    v_add_u16_e32 v1, 0x900, v1
+; VI-NEXT:    v_add_u16_e32 v2, 0x900, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -340,8 +335,6 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
 ; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
-; SI-NEXT:    s_mov_b32 s16, 0xff00
-; SI-NEXT:    s_movk_i32 s17, 0xff
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s8, s0
@@ -353,12 +346,12 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
-; SI-NEXT:    v_and_b32_e32 v2, s16, v0
-; SI-NEXT:    v_and_b32_e32 v4, s16, v1
-; SI-NEXT:    v_and_b32_e32 v3, s17, v3
+; SI-NEXT:    v_and_b32_e32 v2, 0xff00, v0
+; SI-NEXT:    v_and_b32_e32 v4, 0xff00, v1
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_and_b32_e32 v1, s17, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v4, v1
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -374,17 +367,14 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_movk_i32 s16, 0xff00
-; VI-NEXT:    s_movk_i32 s17, 0xff
-; VI-NEXT:    s_movk_i32 s18, 0x900
+; VI-NEXT:    s_mov_b32 s11, 0xf000
+; VI-NEXT:    s_mov_b32 s10, -1
+; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s11, 0xf000
-; VI-NEXT:    s_mov_b32 s10, -1
-; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_mov_b32 s15, s11
 ; VI-NEXT:    s_mov_b32 s8, s0
 ; VI-NEXT:    s_mov_b32 s9, s1
@@ -394,16 +384,16 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
 ; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT:    v_and_b32_e32 v4, s16, v1
+; VI-NEXT:    v_and_b32_e32 v4, 0xffffff00, v1
 ; VI-NEXT:    v_add_u16_e32 v1, 9, v1
 ; VI-NEXT:    v_add_u16_e32 v3, 9, v0
-; VI-NEXT:    v_and_b32_e32 v1, s17, v1
-; VI-NEXT:    v_and_b32_e32 v2, s16, v0
-; VI-NEXT:    v_and_b32_e32 v3, s17, v3
+; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; VI-NEXT:    v_and_b32_e32 v2, 0xffffff00, v0
+; VI-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; VI-NEXT:    v_or_b32_e32 v1, v4, v1
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    v_add_u16_e32 v1, s18, v1
-; VI-NEXT:    v_add_u16_e32 v2, s18, v2
+; VI-NEXT:    v_add_u16_e32 v1, 0x900, v1
+; VI-NEXT:    v_add_u16_e32 v2, 0x900, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index e4a1177f85203..60bcb6d4bae36 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -409,12 +409,11 @@ define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s4, 0xffff
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, s4, v0
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_and_b32_e32 v3, s4, v1
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
@@ -433,7 +432,6 @@ define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s4, 0xffff
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -444,8 +442,8 @@ define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; VI-NEXT:    v_and_b32_e32 v1, s4, v1
-; VI-NEXT:    v_and_b32_e32 v0, s4, v0
+; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
@@ -535,16 +533,15 @@ define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s4, 0xffff
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, s4, v0
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_and_b32_e32 v5, s4, v1
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_and_b32_e32 v6, s4, v2
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_and_b32_e32 v7, s4, v3
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
@@ -571,7 +568,6 @@ define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s4, 0xffff
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -584,10 +580,10 @@ define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; VI-NEXT:    v_and_b32_e32 v3, s4, v3
-; VI-NEXT:    v_and_b32_e32 v2, s4, v2
-; VI-NEXT:    v_and_b32_e32 v1, s4, v1
-; VI-NEXT:    v_and_b32_e32 v0, s4, v0
+; VI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; VI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; VI-NEXT:    v_bcnt_u32_b32 v4, v4, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v5, v5, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
@@ -718,25 +714,24 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16
 ; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s4, 0xffff
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v8, s4, v0
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_and_b32_e32 v9, s4, v1
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_and_b32_e32 v10, s4, v2
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_and_b32_e32 v11, s4, v3
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, s4, v4
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v4
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_and_b32_e32 v13, s4, v5
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_and_b32_e32 v14, s4, v6
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v6
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_and_b32_e32 v15, s4, v7
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v7, v7, 0
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v6, v6, 0
@@ -780,7 +775,6 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s4, 0xffff
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v0
@@ -796,10 +790,10 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
 ; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
-; VI-NEXT:    v_and_b32_e32 v3, s4, v3
-; VI-NEXT:    v_and_b32_e32 v2, s4, v2
-; VI-NEXT:    v_and_b32_e32 v1, s4, v1
-; VI-NEXT:    v_and_b32_e32 v0, s4, v0
+; VI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; VI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v7
 ; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
@@ -809,10 +803,10 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
 ; VI-NEXT:    v_bcnt_u32_b32 v9, v9, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v10, v10, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v11, v11, 0
-; VI-NEXT:    v_and_b32_e32 v7, s4, v7
-; VI-NEXT:    v_and_b32_e32 v6, s4, v6
-; VI-NEXT:    v_and_b32_e32 v5, s4, v5
-; VI-NEXT:    v_and_b32_e32 v4, s4, v4
+; VI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; VI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; VI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 05b13a985935b..d388355d8e713 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1172,7 +1172,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_movk_i32 s8, 0xff
 ; SI-NEXT:    s_mov_b32 s6, s2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
@@ -1186,11 +1185,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v0, s8, v4
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v4
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 9, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
 ; SI-NEXT:    v_or_b32_e32 v0, v7, v0
-; SI-NEXT:    v_and_b32_e32 v2, s8, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x900, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1297,7 +1296,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v7
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v0, s4, v0
+; GFX9-NEXT:    v_add_u16_e32 v0, 0x900, v0
 ; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    global_store_dword v5, v0, s[2:3]

diff  --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index c3262b4671ed2..2c9b0322f57e0 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -116,14 +116,13 @@ define <4 x i16> @extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspac
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
 ; GFX9-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v3, s4, v0
+; GFX9-NEXT:    v_or_b32_e32 v3, 0xffff8000, v0
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v0, v4, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v2, v4, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
@@ -261,13 +260,12 @@ define <4 x i16> @extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i16> addrsp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v2, s4, v0
+; GFX9-NEXT:    v_or_b32_e32 v2, 0xffff8000, v0
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v0, v4, v0
-; GFX9-NEXT:    v_and_b32_e32 v2, v4, v2
+; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index 1af86dda48501..8881d1d8b23c6 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -136,8 +136,7 @@ define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(
 }
 
 ; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
-; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x41700000
-; GCN: v_mac_f32_e32 [[V]], v{{[0-9]+}}, v{{[0-9]+$}}
+; GCN: v_mac_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+$}}
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
@@ -152,7 +151,7 @@ define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(float addrs
 }
 
 ; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
-; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
 ; GCN-DENORM: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000
 ; GCN-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]]
 ; GCN-NOT: v_mul

diff  --git a/llvm/test/CodeGen/AMDGPU/fexp.ll b/llvm/test/CodeGen/AMDGPU/fexp.ll
index 50e9533890b77..f8dc21272f8f0 100644
--- a/llvm/test/CodeGen/AMDGPU/fexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fexp.ll
@@ -18,9 +18,8 @@ define <2 x float> @v_exp_v2f32(<2 x float> %arg0) {
 ; GCN-LABEL: v_exp_v2f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GCN-NEXT:    v_mul_f32_e32 v0, s4, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s4, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-NEXT:    v_exp_f32_e32 v1, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -32,10 +31,9 @@ define <3 x float> @v_exp_v3f32(<3 x float> %arg0) {
 ; GCN-LABEL: v_exp_v3f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GCN-NEXT:    v_mul_f32_e32 v0, s4, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s4, v1
-; GCN-NEXT:    v_mul_f32_e32 v2, s4, v2
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-NEXT:    v_exp_f32_e32 v1, v1
 ; GCN-NEXT:    v_exp_f32_e32 v2, v2
@@ -48,11 +46,10 @@ define <4 x float> @v_exp_v4f32(<4 x float> %arg0) {
 ; GCN-LABEL: v_exp_v4f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GCN-NEXT:    v_mul_f32_e32 v0, s4, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s4, v1
-; GCN-NEXT:    v_mul_f32_e32 v2, s4, v2
-; GCN-NEXT:    v_mul_f32_e32 v3, s4, v3
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-NEXT:    v_exp_f32_e32 v1, v1
 ; GCN-NEXT:    v_exp_f32_e32 v2, v2
@@ -95,11 +92,10 @@ define <2 x half> @v_exp_v2f16(<2 x half> %arg0) {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, s4, v0
-; SI-NEXT:    v_mul_f32_e32 v1, s4, v1
+; SI-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
 ; SI-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -141,15 +137,14 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, s4, v0
-; SI-NEXT:    v_mul_f32_e32 v1, s4, v1
-; SI-NEXT:    v_mul_f32_e32 v2, s4, v2
-; SI-NEXT:    v_mul_f32_e32 v3, s4, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
 ; SI-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-NEXT:    v_exp_f32_e32 v2, v2
@@ -159,11 +154,10 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
 ; VI-LABEL: v_exp_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    s_movk_i32 s4, 0x3dc5
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x3dc5
-; VI-NEXT:    v_mul_f16_e32 v2, s4, v1
+; VI-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v1
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_mul_f16_e32 v4, s4, v0
+; VI-NEXT:    v_mul_f16_e32 v4, 0x3dc5, v0
 ; VI-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_exp_f16_e32 v2, v2
 ; VI-NEXT:    v_exp_f16_e32 v4, v4
@@ -177,9 +171,9 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_movk_i32 s4, 0x3dc5
-; GFX9-NEXT:    v_mul_f16_e32 v2, s4, v1
+; GFX9-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v1
 ; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_mul_f16_e32 v3, s4, v0
+; GFX9-NEXT:    v_mul_f16_e32 v3, 0x3dc5, v0
 ; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_exp_f16_e32 v2, v2
 ; GFX9-NEXT:    v_exp_f16_e32 v3, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 81ff336ef3210..95e809711e481 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -515,13 +515,12 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
-; GFX9-NEXT:    scratch_store_dword v2, v3, off
+; GFX9-NEXT:    v_add_u32_e32 v1, 4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -548,16 +547,15 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
-; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 4, v0
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 4, v0
 ; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
@@ -1367,12 +1365,11 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x104
-; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
-; GFX9-NEXT:    scratch_store_dword v2, v3, off
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x104, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0x104, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -1402,18 +1399,17 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
 ; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
-; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
-; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x104, v0
+; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
+; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x104, v0
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
@@ -2254,12 +2250,11 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4004
-; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
-; GFX9-NEXT:    scratch_store_dword v2, v3, off
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -2289,18 +2284,17 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
 ; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
-; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
-; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x4004, v0
+; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
+; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
index 49667f7008db8..bf4de4c993025 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
@@ -241,9 +241,8 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v0, v2
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX9-SAFE-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX9-SAFE-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX9-SAFE-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX9-SAFE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SAFE-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-SAFE-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
 ; GFX9-SAFE-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
@@ -369,11 +368,10 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v0, v4
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX9-SAFE-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX9-SAFE-NEXT:    v_and_b32_e32 v0, v4, v0
-; GFX9-SAFE-NEXT:    v_and_b32_e32 v1, v4, v1
-; GFX9-SAFE-NEXT:    v_and_b32_e32 v2, v4, v2
-; GFX9-SAFE-NEXT:    v_and_b32_e32 v3, v4, v3
+; GFX9-SAFE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SAFE-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-SAFE-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-SAFE-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX9-SAFE-NEXT:    v_lshl_or_b32 v0, v8, 16, v0
 ; GFX9-SAFE-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
 ; GFX9-SAFE-NEXT:    v_lshl_or_b32 v2, v12, 16, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
index a0096b01c659d..9f68a859f89c0 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
@@ -242,9 +242,8 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v2
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX9-SAFE-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX9-SAFE-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX9-SAFE-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX9-SAFE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SAFE-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-SAFE-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
 ; GFX9-SAFE-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
@@ -370,11 +369,10 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v4
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX9-SAFE-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX9-SAFE-NEXT:    v_and_b32_e32 v0, v4, v0
-; GFX9-SAFE-NEXT:    v_and_b32_e32 v1, v4, v1
-; GFX9-SAFE-NEXT:    v_and_b32_e32 v2, v4, v2
-; GFX9-SAFE-NEXT:    v_and_b32_e32 v3, v4, v3
+; GFX9-SAFE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SAFE-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-SAFE-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-SAFE-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX9-SAFE-NEXT:    v_lshl_or_b32 v0, v8, 16, v0
 ; GFX9-SAFE-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
 ; GFX9-SAFE-NEXT:    v_lshl_or_b32 v2, v12, 16, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
index 24a6d72f1bd88..ae13928c3e298 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -161,13 +161,12 @@ define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, fl
 }
 
 ; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
-; GCN-SAFE-DAG: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1{{$}}
 ; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN-DAG: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 
-; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[SIGNBIT]], [[A]]
+; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
 ; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
-; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[SIGNBIT]], [[ADD]]
+; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
 
 ; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
 ; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]

diff  --git a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
index 75d5a73347f54..344f394a64412 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
@@ -170,9 +170,8 @@ body:             |
 # operands
 
 # CHECK-LABEL: name: add_f32_1.0_multi_f16_use
-# CHECK: %13:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
-# CHECK: %14:vgpr_32 = V_ADD_F16_e32 killed %11, %13, implicit $mode, implicit $exec
-# CHECK: %15:vgpr_32 = V_ADD_F16_e32 killed %12, killed %13, implicit $mode, implicit $exec
+# CHECK: %14:vgpr_32 = V_ADD_F16_e32 1065353216, killed %11, implicit $mode, implicit $exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1065353216, killed %12, implicit $mode, implicit $exec
 
 
 name:            add_f32_1.0_multi_f16_use
@@ -306,9 +305,8 @@ body:             |
 #  constant, and not folded as a multi-use literal for the f16 cases
 
 # CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use
-# CHECK: %14:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
-# CHECK: %15:vgpr_32 = V_ADD_F16_e32  %11, %14, implicit $mode, implicit $exec
-# CHECK: %16:vgpr_32 = V_ADD_F16_e32 %12,  %14, implicit $mode, implicit $exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1065353216, %11, implicit $mode, implicit $exec
+# CHECK: %16:vgpr_32 = V_ADD_F16_e32 1065353216, %12, implicit $mode, implicit $exec
 # CHECK: %17:vgpr_32 = V_ADD_F32_e32 1065353216, killed %13, implicit $mode, implicit $exec
 
 name:            add_f32_1.0_one_f32_use_multi_f16_use
@@ -511,9 +509,8 @@ body:             |
 #  constant, and not folded as a multi-use literal for the f16 cases
 
 # CHECK-LABEL: name: add_f16_1.0_multi_f32_use
-# CHECK: %13:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec
-# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit $mode, implicit $exec
-# CHECK: %15:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec
+# CHECK: %14:vgpr_32 = V_ADD_F32_e32 15360, %11, implicit $mode, implicit $exec
+# CHECK: %15:vgpr_32 = V_ADD_F32_e32 15360, %12, implicit $mode, implicit $exec
 
 name:            add_f16_1.0_multi_f32_use
 alignment:       1
@@ -575,12 +572,10 @@ body:             |
 ---
 
 # The low 16-bits are an inline immediate, but the high bits are junk
-# FIXME: Should be able to fold this
 
 # CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use
-# CHECK: %13:vgpr_32 = V_MOV_B32_e32 80886784, implicit $exec
-# CHECK: %14:vgpr_32 = V_ADD_F16_e32 %11, %13, implicit $mode, implicit $exec
-# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit $mode, implicit $exec
+# CHECK: %14:vgpr_32 = V_ADD_F16_e32 80886784, %11, implicit $mode, implicit $exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 80886784, %12, implicit $mode, implicit $exec
 
 name:            add_f16_1.0_other_high_bits_multi_f16_use
 alignment:       1
@@ -641,13 +636,9 @@ body:             |
 ...
 ---
 
-# FIXME: Should fold inline immediate into f16 and literal use into
-# f32 instruction.
-
 # CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32
-# CHECK: %13:vgpr_32 = V_MOV_B32_e32 305413120, implicit $exec
-# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit $mode, implicit $exec
-# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit $mode, implicit $exec
+# CHECK: %14:vgpr_32 = V_ADD_F32_e32 305413120, %11, implicit $mode, implicit $exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 305413120, %12, implicit $mode, implicit $exec
 name:            add_f16_1.0_other_high_bits_use_f16_f32
 alignment:       1
 exposesReturnsTwice: false

diff  --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index 6f0416ff9ebf0..cea7622471203 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -242,19 +242,18 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %ou
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s2, 0x2f800000
-; SI-NEXT:    s_mov_b32 s3, 0xcf800000
+; SI-NEXT:    s_mov_b32 s2, 0xcf800000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_trunc_f32_e32 v0, s1
 ; SI-NEXT:    v_trunc_f32_e32 v2, s0
-; SI-NEXT:    v_mul_f32_e32 v1, s2, v0
-; SI-NEXT:    v_mul_f32_e32 v3, s2, v2
+; SI-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; SI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; SI-NEXT:    v_floor_f32_e32 v4, v1
 ; SI-NEXT:    v_floor_f32_e32 v5, v3
 ; SI-NEXT:    v_cvt_u32_f32_e32 v3, v4
 ; SI-NEXT:    v_cvt_u32_f32_e32 v1, v5
-; SI-NEXT:    v_fma_f32 v0, v4, s3, v0
-; SI-NEXT:    v_fma_f32 v4, v5, s3, v2
+; SI-NEXT:    v_fma_f32 v0, v4, s2, v0
+; SI-NEXT:    v_fma_f32 v4, v5, s2, v2
 ; SI-NEXT:    v_cvt_u32_f32_e32 v2, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v4
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -264,12 +263,11 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %ou
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s4, 0x2f800000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_trunc_f32_e32 v0, s3
 ; VI-NEXT:    v_trunc_f32_e32 v4, s2
-; VI-NEXT:    v_mul_f32_e32 v1, s4, v0
-; VI-NEXT:    v_mul_f32_e32 v2, s4, v4
+; VI-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; VI-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v4
 ; VI-NEXT:    v_floor_f32_e32 v5, v1
 ; VI-NEXT:    s_mov_b32 s2, 0xcf800000
 ; VI-NEXT:    v_floor_f32_e32 v6, v2
@@ -379,29 +377,28 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %ou
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s8, 0x2f800000
-; SI-NEXT:    s_mov_b32 s9, 0xcf800000
+; SI-NEXT:    s_mov_b32 s8, 0xcf800000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_trunc_f32_e32 v0, s1
 ; SI-NEXT:    v_trunc_f32_e32 v2, s0
 ; SI-NEXT:    v_trunc_f32_e32 v4, s3
 ; SI-NEXT:    v_trunc_f32_e32 v6, s2
-; SI-NEXT:    v_mul_f32_e32 v1, s8, v0
-; SI-NEXT:    v_mul_f32_e32 v3, s8, v2
-; SI-NEXT:    v_mul_f32_e32 v5, s8, v4
-; SI-NEXT:    v_mul_f32_e32 v7, s8, v6
+; SI-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; SI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; SI-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; SI-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
 ; SI-NEXT:    v_floor_f32_e32 v8, v1
 ; SI-NEXT:    v_floor_f32_e32 v9, v3
 ; SI-NEXT:    v_floor_f32_e32 v10, v5
 ; SI-NEXT:    v_floor_f32_e32 v11, v7
 ; SI-NEXT:    v_cvt_u32_f32_e32 v3, v8
 ; SI-NEXT:    v_cvt_u32_f32_e32 v1, v9
-; SI-NEXT:    v_fma_f32 v0, v8, s9, v0
-; SI-NEXT:    v_fma_f32 v8, v9, s9, v2
+; SI-NEXT:    v_fma_f32 v0, v8, s8, v0
+; SI-NEXT:    v_fma_f32 v8, v9, s8, v2
 ; SI-NEXT:    v_cvt_u32_f32_e32 v7, v10
 ; SI-NEXT:    v_cvt_u32_f32_e32 v5, v11
-; SI-NEXT:    v_fma_f32 v4, v10, s9, v4
-; SI-NEXT:    v_fma_f32 v9, v11, s9, v6
+; SI-NEXT:    v_fma_f32 v4, v10, s8, v4
+; SI-NEXT:    v_fma_f32 v9, v11, s8, v6
 ; SI-NEXT:    v_cvt_u32_f32_e32 v2, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v8
 ; SI-NEXT:    v_cvt_u32_f32_e32 v6, v4
@@ -414,34 +411,33 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %ou
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s2, 0x2f800000
-; VI-NEXT:    s_mov_b32 s3, 0xcf800000
+; VI-NEXT:    s_mov_b32 s2, 0xcf800000
+; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_trunc_f32_e32 v0, s5
 ; VI-NEXT:    v_trunc_f32_e32 v4, s4
-; VI-NEXT:    v_mul_f32_e32 v1, s2, v0
-; VI-NEXT:    v_mul_f32_e32 v2, s2, v4
+; VI-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; VI-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v4
 ; VI-NEXT:    v_floor_f32_e32 v5, v1
 ; VI-NEXT:    v_floor_f32_e32 v6, v2
-; VI-NEXT:    v_fma_f32 v0, v5, s3, v0
+; VI-NEXT:    v_fma_f32 v0, v5, s2, v0
 ; VI-NEXT:    v_cvt_u32_f32_e32 v2, v0
-; VI-NEXT:    v_fma_f32 v0, v6, s3, v4
+; VI-NEXT:    v_fma_f32 v0, v6, s2, v4
 ; VI-NEXT:    v_trunc_f32_e32 v4, s7
 ; VI-NEXT:    v_cvt_u32_f32_e32 v3, v5
-; VI-NEXT:    v_mul_f32_e32 v5, s2, v4
+; VI-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; VI-NEXT:    v_trunc_f32_e32 v8, s6
 ; VI-NEXT:    v_cvt_u32_f32_e32 v1, v6
 ; VI-NEXT:    v_floor_f32_e32 v6, v5
-; VI-NEXT:    v_mul_f32_e32 v5, s2, v8
+; VI-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v8
 ; VI-NEXT:    v_floor_f32_e32 v9, v5
-; VI-NEXT:    v_fma_f32 v4, v6, s3, v4
+; VI-NEXT:    v_fma_f32 v4, v6, s2, v4
 ; VI-NEXT:    v_cvt_u32_f32_e32 v7, v6
 ; VI-NEXT:    v_cvt_u32_f32_e32 v6, v4
-; VI-NEXT:    v_fma_f32 v4, v9, s3, v8
+; VI-NEXT:    v_fma_f32 v4, v9, s2, v8
 ; VI-NEXT:    v_cvt_u32_f32_e32 v5, v9
 ; VI-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll
index bf783cba5ace7..fe20b153a6cf1 100644
--- a/llvm/test/CodeGen/AMDGPU/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpow.ll
@@ -479,14 +479,13 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    s_mov_b32 s4, 0x80008000
-; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT:    v_xor_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 7b17b54becf38..c10143c928b23 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -2619,8 +2619,7 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
 ; SI-NEXT:    v_and_b32_e32 v10, v8, v10
 ; SI-NEXT:    v_not_b32_e32 v11, v11
 ; SI-NEXT:    v_and_b32_e32 v11, v9, v11
-; SI-NEXT:    s_brev_b32 s8, 1
-; SI-NEXT:    v_and_b32_e32 v13, s8, v9
+; SI-NEXT:    v_and_b32_e32 v13, 0x80000000, v9
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v12
 ; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v12
@@ -2650,7 +2649,7 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
 ; SI-NEXT:    v_and_b32_e32 v8, v6, v8
 ; SI-NEXT:    v_not_b32_e32 v9, v9
 ; SI-NEXT:    v_and_b32_e32 v9, v7, v9
-; SI-NEXT:    v_and_b32_e32 v11, s8, v7
+; SI-NEXT:    v_and_b32_e32 v11, 0x80000000, v7
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v10
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v10

diff  --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index a3058fc1f4441..39904b73cb42b 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -697,10 +697,9 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xf000f, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -737,14 +736,13 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ; SI-NEXT:    v_or_b32_e32 v4, 16, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v3, v4
-; SI-NEXT:    s_mov_b32 s4, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v3, 16, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_and_b32_e32 v0, s4, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_alignbit_b32 v3, v2, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_and_b32_e32 v2, s4, v3
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -844,12 +842,11 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; SI-NEXT:    v_alignbit_b32 v3, v3, v5, v4
 ; SI-NEXT:    v_or_b32_e32 v4, 16, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; SI-NEXT:    s_mov_b32 s4, 0xffff
 ; SI-NEXT:    v_alignbit_b32 v2, v2, v5, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_and_b32_e32 v2, s4, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_and_b32_e32 v0, s4, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
@@ -911,10 +908,9 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX9-NEXT:    v_or_b32_e32 v7, v7, v9
-; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -1164,16 +1160,15 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; SI-LABEL: v_fshr_v2i24:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, 0xffffff
-; SI-NEXT:    v_and_b32_e32 v6, s4, v4
-; SI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
-; SI-NEXT:    v_mul_hi_u32 v6, v6, s5
-; SI-NEXT:    v_and_b32_e32 v7, s4, v5
+; SI-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
+; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
+; SI-NEXT:    v_mul_hi_u32 v6, v6, s4
+; SI-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
 ; SI-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
 ; SI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
-; SI-NEXT:    v_mul_hi_u32 v6, v7, s5
+; SI-NEXT:    v_mul_hi_u32 v6, v7, s4
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 8, v4
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
@@ -1187,16 +1182,15 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; VI-LABEL: v_fshr_v2i24:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, 0xffffff
-; VI-NEXT:    v_and_b32_e32 v6, s4, v4
-; VI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
-; VI-NEXT:    v_mul_hi_u32 v6, v6, s5
-; VI-NEXT:    v_and_b32_e32 v7, s4, v5
+; VI-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
+; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
+; VI-NEXT:    v_mul_hi_u32 v6, v6, s4
+; VI-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
 ; VI-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
 ; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v6
-; VI-NEXT:    v_mul_hi_u32 v6, v7, s5
+; VI-NEXT:    v_mul_hi_u32 v6, v7, s4
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 8, v4
 ; VI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
@@ -1210,16 +1204,15 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; GFX9-LABEL: v_fshr_v2i24:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0xffffff
-; GFX9-NEXT:    v_and_b32_e32 v6, s4, v4
-; GFX9-NEXT:    s_mov_b32 s5, 0xaaaaaaab
-; GFX9-NEXT:    v_mul_hi_u32 v6, v6, s5
-; GFX9-NEXT:    v_and_b32_e32 v7, s4, v5
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
+; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
+; GFX9-NEXT:    v_mul_hi_u32 v6, v6, s4
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
-; GFX9-NEXT:    v_mul_hi_u32 v6, v7, s5
+; GFX9-NEXT:    v_mul_hi_u32 v6, v7, s4
 ; GFX9-NEXT:    v_add_u32_e32 v4, 8, v4
 ; GFX9-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index 1b4a181f7e127..b68c1d158b939 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -24,17 +24,16 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s5
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -44,7 +43,6 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -54,15 +52,15 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s3
+; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -163,19 +161,18 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -577,17 +574,16 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s5
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -597,7 +593,6 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -607,15 +602,15 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s3
+; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -627,20 +622,19 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-NODL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT:    v_and_b32_e32 v3, s0, v1
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_and_b32_e32 v4, s0, v2
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v2, v1, s1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v4, v3, v1
 ; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NODL-NEXT:    s_endpgm
@@ -998,17 +992,16 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, s5
+; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -1018,7 +1011,6 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -1028,15 +1020,15 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    flat_load_dword v1, v[2:3]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v3, s2, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v0, v1, v0, s3
+; GFX8-NEXT:    v_mad_u32_u24 v0, v1, v0, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v2, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -1134,17 +1126,16 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, s5
+; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -1154,7 +1145,6 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v0
@@ -1168,15 +1158,15 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_and_b32_e32 v1, s2, v2
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v3, s2, v0
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, s3
+; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -1275,17 +1265,16 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
 ; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, s5
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -1295,7 +1284,6 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -1305,15 +1293,15 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_and_b32_e32 v1, s2, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v3, s2, v3
-; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, s2, v2
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v1, v3, v1, s3
+; GFX8-NEXT:    v_mad_u32_u24 v1, v3, v1, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -1418,17 +1406,16 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
 ; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, s5
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -1438,7 +1425,6 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -1448,15 +1434,15 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_and_b32_e32 v1, s2, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v3, s2, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v1, v3, v1, s3
+; GFX8-NEXT:    v_mad_u32_u24 v1, v3, v1, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -1560,17 +1546,16 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v1, s5
+; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v1, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v2, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -1580,7 +1565,6 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -1590,15 +1574,15 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s3
+; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -1702,17 +1686,16 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s5
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -1723,7 +1706,6 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -1733,15 +1715,15 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s3
+; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -2006,17 +1988,16 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v4, v0, v2, s5
+; GFX7-NEXT:    v_mad_u32_u24 v4, v0, v2, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, v4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -2027,7 +2008,6 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -2037,15 +2017,15 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v4, v2, v1, s3
+; GFX8-NEXT:    v_mad_u32_u24 v4, v2, v1, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, v4
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -2058,20 +2038,19 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-NODL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT:    v_and_b32_e32 v3, s0, v1
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_and_b32_e32 v4, s0, v2
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
 ; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v2, v4, v3
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v3, s1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v3, s0
 ; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v2
 ; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NODL-NEXT:    s_endpgm
@@ -2081,20 +2060,19 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_and_b32_e32 v3, s0, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_and_b32_e32 v4, s0, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
 ; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, v4, v3
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v4, v3, s1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v4, v3, s0
 ; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, v2
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
@@ -2324,17 +2302,16 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v4, v3, v1, s5
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_mad_u32_u24 v4, v3, v1, s4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, v4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -2345,7 +2322,6 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -2355,15 +2331,15 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v4, v0, v3, s3
+; GFX8-NEXT:    v_mad_u32_u24 v4, v0, v3, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, v4
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -2638,13 +2614,12 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 94dfc6dd88b7c..b44d8bd9276e5 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -188,28 +188,27 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
 ; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_bfe_i32 v6, v0, 0, 8
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX7-NEXT:    v_bfe_i32 v7, v0, 8, 8
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX7-NEXT:    v_bfe_i32 v8, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
-; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -418,12 +417,11 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
@@ -964,34 +962,33 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
 ; GFX7-NEXT:    v_bfe_i32 v4, v2, 0, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_i32 v7, v0, 0, 8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v7
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v7
 ; GFX7-NEXT:    v_bfe_i32 v8, v0, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v6, v4
-; GFX7-NEXT:    v_and_b32_e32 v7, s4, v8
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
-; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v8, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v7, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 228c17bb5e80a..693d5f62975fb 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -21,17 +21,16 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, s5
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, s4
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
@@ -47,7 +46,6 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -57,17 +55,17 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
 ; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
 ; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s3
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
 ; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
 ; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
@@ -189,12 +187,11 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
@@ -226,17 +223,16 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_and_b32_e32 v6, s0, v3
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xff, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
-; GFX8-NEXT:    v_and_b32_e32 v8, s0, v8
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xff, v8
 ; GFX8-NEXT:    v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_and_b32_e32 v7, s0, v2
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xff, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX8-NEXT:    v_and_b32_e32 v9, s0, v9
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xff, v9
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v4, v6, v7, v4
 ; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -259,13 +255,13 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[2:3]
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NODL-NEXT:    v_and_b32_e32 v4, s0, v1
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v1
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT:    v_and_b32_e32 v5, s0, v2
+; GFX9-NODL-NEXT:    v_and_b32_e32 v5, 0xff, v2
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-NODL-NEXT:    v_and_b32_e32 v6, s0, v6
-; GFX9-NODL-NEXT:    v_and_b32_e32 v7, s0, v7
+; GFX9-NODL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX9-NODL-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
 ; GFX9-NODL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -290,13 +286,13 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_and_b32_e32 v4, s0, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xff, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_and_b32_e32 v5, s0, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v5, 0xff, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-DL-NEXT:    v_and_b32_e32 v6, s0, v6
-; GFX9-DL-NEXT:    v_and_b32_e32 v7, s0, v7
+; GFX9-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX9-DL-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
 ; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -401,12 +397,11 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
@@ -585,12 +580,11 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
@@ -727,12 +721,11 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v3, v1
@@ -911,13 +904,12 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v4, v1
 ; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
@@ -1092,17 +1084,16 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v8, v1, v5, s5
+; GFX7-NEXT:    v_mad_u32_u24 v8, v1, v5, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v3, v3, v6, v8
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
@@ -1119,7 +1110,6 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -1129,17 +1119,17 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
 ; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
 ; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v8, v1, v2, s3
+; GFX8-NEXT:    v_mad_u32_u24 v8, v1, v2, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, v8
 ; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
 ; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, v4
@@ -1156,22 +1146,21 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-NODL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT:    v_and_b32_e32 v3, s0, v1
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_and_b32_e32 v4, s0, v2
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
 ; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
 ; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
 ; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v2, v3, v4
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v3, v4, s1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v3, v4, s0
 ; GFX9-NODL-NEXT:    v_add3_u32 v2, v5, v3, v2
 ; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
 ; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
@@ -1182,22 +1171,21 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_and_b32_e32 v3, s0, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_and_b32_e32 v4, s0, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xff, v2
 ; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
 ; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v4
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v3, v4, s1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v3, v4, s0
 ; GFX9-DL-NEXT:    v_add3_u32 v2, v5, v3, v2
 ; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v6, v1
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
@@ -1287,24 +1275,23 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
-; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v3, v3, v6, s5
+; GFX7-NEXT:    v_mad_u32_u24 v3, v3, v6, s4
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
-; GFX7-NEXT:    v_add_i32_e32 v6, vcc, s5, v3
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, s4, v3
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -1315,7 +1302,6 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -1325,22 +1311,22 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
-; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
 ; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
-; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, s3
+; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, s2
 ; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
 ; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s3, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s2, v4
 ; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v5
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -1484,15 +1470,14 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_bfe_i32 v6, v0, 0, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
 ; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
@@ -1523,16 +1508,15 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
-; GFX8-NEXT:    v_and_b32_e32 v8, s0, v8
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xff, v8
 ; GFX8-NEXT:    v_bfe_i32 v6, v3, 0, 8
 ; GFX8-NEXT:    v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX8-NEXT:    v_and_b32_e32 v9, s0, v9
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xff, v9
 ; GFX8-NEXT:    v_bfe_i32 v7, v2, 0, 8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v4, v8, v9, v4
@@ -1559,8 +1543,8 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-NODL-NEXT:    v_and_b32_e32 v6, s0, v6
-; GFX9-NODL-NEXT:    v_and_b32_e32 v7, s0, v7
+; GFX9-NODL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX9-NODL-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 0, 8
 ; GFX9-NODL-NEXT:    v_bfe_i32 v5, v2, 0, 8
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
@@ -1590,8 +1574,8 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-DL-NEXT:    v_and_b32_e32 v6, s0, v6
-; GFX9-DL-NEXT:    v_and_b32_e32 v7, s0, v7
+; GFX9-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX9-DL-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX9-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
 ; GFX9-DL-NEXT:    v_bfe_i32 v5, v2, 0, 8
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -1697,21 +1681,20 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
 ; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, s5
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v6, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v7, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v5, v0
@@ -1723,7 +1706,6 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -1733,19 +1715,19 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
 ; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 8
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 8, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, s2, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 8
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 8, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, s3
+; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v7, v0
 ; GFX8-NEXT:    v_mad_u32_u24 v0, v4, v6, v0
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v2, v0
@@ -1867,22 +1849,20 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NEXT:    s_mov_b32 s4, 0xff00
-; GFX7-NEXT:    s_movk_i32 s5, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
-; GFX7-NEXT:    v_and_b32_e32 v4, s5, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff00, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff00, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX7-NEXT:    v_and_b32_e32 v7, s5, v0
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
 ; GFX7-NEXT:    v_or_b32_e32 v4, v7, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, s5, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s5, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
 ; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
@@ -1913,17 +1893,16 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 8, v3
 ; GFX8-NEXT:    v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v3, s0, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v9, 8, v2
 ; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
 ; GFX8-NEXT:    v_mad_u16 v2, v7, v9, v2
@@ -1956,14 +1935,14 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NODL-NEXT:    v_lshl_or_b32 v2, v7, 16, v2
 ; GFX9-NODL-NEXT:    v_lshl_or_b32 v1, v5, 16, v1
-; GFX9-NODL-NEXT:    v_and_b32_e32 v10, v4, v10
-; GFX9-NODL-NEXT:    v_and_b32_e32 v4, v4, v9
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xffff, v10
+; GFX9-NODL-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-NODL-NEXT:    v_lshl_or_b32 v5, v8, 16, v10
-; GFX9-NODL-NEXT:    v_lshl_or_b32 v4, v6, 16, v4
+; GFX9-NODL-NEXT:    v_lshl_or_b32 v4, v8, 16, v4
+; GFX9-NODL-NEXT:    v_lshl_or_b32 v5, v6, 16, v9
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NODL-NEXT:    v_add_u16_e32 v3, v1, v3
-; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
+; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v2, v5, v4
 ; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v2
 ; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1994,14 +1973,14 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v7, 16, v2
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v5, 16, v1
-; GFX9-DL-NEXT:    v_and_b32_e32 v10, v4, v10
-; GFX9-DL-NEXT:    v_and_b32_e32 v4, v4, v9
+; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xffff, v10
+; GFX9-DL-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-DL-NEXT:    v_lshl_or_b32 v5, v8, 16, v10
-; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v6, 16, v4
+; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v8, 16, v4
+; GFX9-DL-NEXT:    v_lshl_or_b32 v5, v6, 16, v9
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_add_u16_e32 v3, v1, v3
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v5, v4
 ; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v2
 ; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -2094,12 +2073,11 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v2
 ; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v7, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v8, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index eacbd6f0e2e60..fafabd98bbd87 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -344,49 +344,48 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 4
 ; GFX7-NEXT:    v_bfe_i32 v4, v2, 4, 4
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_bfe_i32 v10, v0, 0, 4
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX7-NEXT:    v_bfe_i32 v11, v0, 4, 4
-; GFX7-NEXT:    v_and_b32_e32 v10, s4, v10
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff, v10
 ; GFX7-NEXT:    v_bfe_i32 v5, v2, 8, 4
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX7-NEXT:    v_bfe_i32 v12, v0, 8, 4
-; GFX7-NEXT:    v_and_b32_e32 v11, s4, v11
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v10, v1
 ; GFX7-NEXT:    v_bfe_i32 v6, v2, 12, 4
-; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
-; GFX7-NEXT:    v_and_b32_e32 v12, s4, v12
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff, v12
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
 ; GFX7-NEXT:    v_bfe_i32 v7, v2, 16, 4
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX7-NEXT:    v_bfe_i32 v14, v0, 16, 4
-; GFX7-NEXT:    v_and_b32_e32 v13, s4, v13
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
 ; GFX7-NEXT:    v_bfe_i32 v8, v2, 20, 4
-; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; GFX7-NEXT:    v_bfe_i32 v15, v0, 20, 4
-; GFX7-NEXT:    v_and_b32_e32 v14, s4, v14
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff, v14
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
 ; GFX7-NEXT:    v_bfe_i32 v9, v2, 24, 4
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX7-NEXT:    v_bfe_i32 v16, v0, 24, 4
-; GFX7-NEXT:    v_and_b32_e32 v15, s4, v15
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
-; GFX7-NEXT:    v_and_b32_e32 v9, s4, v9
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
-; GFX7-NEXT:    v_and_b32_e32 v16, s4, v16
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff, v16
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v15, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v16, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -917,49 +916,48 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 4
 ; GFX7-NEXT:    v_bfe_i32 v4, v2, 4, 4
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_bfe_i32 v10, v0, 0, 4
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX7-NEXT:    v_bfe_i32 v11, v0, 4, 4
-; GFX7-NEXT:    v_and_b32_e32 v10, s4, v10
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v10
 ; GFX7-NEXT:    v_bfe_i32 v5, v2, 8, 4
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GFX7-NEXT:    v_bfe_i32 v12, v0, 8, 4
-; GFX7-NEXT:    v_and_b32_e32 v11, s4, v11
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xff, v11
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v10, v1
 ; GFX7-NEXT:    v_bfe_i32 v6, v2, 12, 4
-; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
-; GFX7-NEXT:    v_and_b32_e32 v12, s4, v12
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xff, v12
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
 ; GFX7-NEXT:    v_bfe_i32 v7, v2, 16, 4
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX7-NEXT:    v_bfe_i32 v14, v0, 16, 4
-; GFX7-NEXT:    v_and_b32_e32 v13, s4, v13
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v13
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
 ; GFX7-NEXT:    v_bfe_i32 v8, v2, 20, 4
-; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX7-NEXT:    v_bfe_i32 v15, v0, 20, 4
-; GFX7-NEXT:    v_and_b32_e32 v14, s4, v14
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xff, v14
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
 ; GFX7-NEXT:    v_bfe_i32 v9, v2, 24, 4
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v8
 ; GFX7-NEXT:    v_bfe_i32 v16, v0, 24, 4
-; GFX7-NEXT:    v_and_b32_e32 v15, s4, v15
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xff, v15
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
-; GFX7-NEXT:    v_and_b32_e32 v9, s4, v9
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v9
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
-; GFX7-NEXT:    v_and_b32_e32 v16, s4, v16
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xff, v16
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v15, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v16, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
 ; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
@@ -2204,7 +2202,6 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 20, 4
@@ -2212,9 +2209,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_bfe_i32 v5, v2, 4, 4
 ; GFX7-NEXT:    v_bfe_i32 v6, v2, 0, 4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_bfe_i32 v10, v0, 20, 4
 ; GFX7-NEXT:    v_bfe_i32 v11, v0, 16, 4
@@ -2223,19 +2220,19 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v4, v6, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v11
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
-; GFX7-NEXT:    v_and_b32_e32 v11, s4, v13
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff, v13
 ; GFX7-NEXT:    v_bfe_i32 v14, v0, 24, 4
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v16, 28, v0
 ; GFX7-NEXT:    v_or_b32_e32 v5, v6, v5
 ; GFX7-NEXT:    v_or_b32_e32 v6, v11, v10
-; GFX7-NEXT:    v_and_b32_e32 v12, s4, v14
-; GFX7-NEXT:    v_and_b32_e32 v14, s4, v16
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff, v14
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff, v16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX7-NEXT:    v_bfe_i32 v8, v2, 8, 4
 ; GFX7-NEXT:    v_bfe_i32 v15, v0, 8, 4
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -2243,22 +2240,22 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_bfe_i32 v7, v2, 24, 4
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 28, v2
 ; GFX7-NEXT:    v_bfe_i32 v2, v2, 12, 4
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 12, 4
-; GFX7-NEXT:    v_and_b32_e32 v13, s4, v15
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff, v15
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v16, v11, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v13, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v5, v0
-; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v15, v10, v0
-; GFX7-NEXT:    v_and_b32_e32 v9, s4, v9
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v12, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v14, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -2845,8 +2842,6 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
-; GFX7-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 28, v2
@@ -2858,13 +2853,13 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_bfe_i32 v9, v2, 4, 4
 ; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v11, 28, v0
 ; GFX7-NEXT:    v_bfe_i32 v12, v0, 24, 4
@@ -2879,30 +2874,30 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_or_b32_e32 v6, v8, v7
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v11
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v12
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v12
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v13
-; GFX7-NEXT:    v_and_b32_e32 v10, s4, v14
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v14
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v15
-; GFX7-NEXT:    v_and_b32_e32 v13, s4, v16
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v16
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v17
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, s5, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX7-NEXT:    v_or_b32_e32 v7, v8, v7
 ; GFX7-NEXT:    v_or_b32_e32 v8, v10, v9
 ; GFX7-NEXT:    v_or_b32_e32 v9, v13, v12
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v14
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_and_b32_e32 v2, s5, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
-; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v7
-; GFX7-NEXT:    v_and_b32_e32 v7, s4, v2
-; GFX7-NEXT:    v_and_b32_e32 v13, s4, v0
-; GFX7-NEXT:    v_and_b32_e32 v6, s5, v8
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v2
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v0
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v8
 ; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -2914,8 +2909,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v14, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v9, s4, v4
-; GFX7-NEXT:    v_and_b32_e32 v15, s4, v5
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v4
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xff, v5
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v12, v0
 ; GFX7-NEXT:    v_bfe_u32 v10, v4, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v16, v5, 8, 8
@@ -2923,8 +2918,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v10, v16, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX7-NEXT:    v_and_b32_e32 v11, s4, v11
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xff, v11
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v5, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v11, v0
 ; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 7994f83fa4313..f9d7f537f2953 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -2119,7 +2119,6 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NEXT:    s_mov_b32 s4, 0xf0000
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_bfe_u32 v8, v2, 20, 4
@@ -2130,12 +2129,12 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_bfe_u32 v6, v2, 8, 4
 ; GFX7-NEXT:    v_and_b32_e32 v7, 15, v2
 ; GFX7-NEXT:    v_alignbit_b32 v2, v8, v2, 16
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v9
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xf0000, v9
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 12, v0
 ; GFX7-NEXT:    v_and_b32_e32 v14, 15, v0
 ; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v9
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xf0000, v9
 ; GFX7-NEXT:    v_or_b32_e32 v8, v14, v8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX7-NEXT:    v_and_b32_e32 v7, 15, v7
@@ -2226,55 +2225,54 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    global_load_ushort v3, v0, s[2:3]
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_bfe_u32 v5, v1, 4, 4
-; GFX9-NEXT:    v_and_b32_e32 v6, 15, v1
-; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v8, v1, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v9, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v10, v1, 16, 4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v1, 24, 4
+; GFX9-NEXT:    v_and_b32_e32 v5, 15, v1
+; GFX9-NEXT:    v_bfe_u32 v7, v1, 8, 4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_bfe_u32 v12, v2, 4, 4
-; GFX9-NEXT:    v_and_b32_e32 v13, 15, v2
-; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v16, v2, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v17, v2, 16, 4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 28, v2
+; GFX9-NEXT:    v_and_b32_e32 v12, 15, v2
+; GFX9-NEXT:    v_bfe_u32 v4, v1, 4, 4
+; GFX9-NEXT:    v_bfe_u32 v6, v1, 12, 4
+; GFX9-NEXT:    v_bfe_u32 v11, v2, 4, 4
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 4
+; GFX9-NEXT:    v_bfe_u32 v14, v2, 8, 4
+; GFX9-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
+; GFX9-NEXT:    v_lshl_or_b32 v7, v11, 16, v12
+; GFX9-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
+; GFX9-NEXT:    v_bfe_u32 v8, v1, 20, 4
+; GFX9-NEXT:    v_bfe_u32 v13, v2, 12, 4
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v1, 24, 4
+; GFX9-NEXT:    v_bfe_u32 v15, v2, 20, 4
+; GFX9-NEXT:    v_bfe_u32 v16, v2, 16, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 28, v2
 ; GFX9-NEXT:    v_bfe_u32 v2, v2, 24, 4
-; GFX9-NEXT:    v_and_b32_e32 v2, v4, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
-; GFX9-NEXT:    v_and_b32_e32 v17, v4, v17
-; GFX9-NEXT:    v_and_b32_e32 v10, v4, v10
-; GFX9-NEXT:    v_and_b32_e32 v15, v4, v15
-; GFX9-NEXT:    v_and_b32_e32 v8, v4, v8
-; GFX9-NEXT:    v_and_b32_e32 v13, v4, v13
-; GFX9-NEXT:    v_and_b32_e32 v4, v4, v6
-; GFX9-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
-; GFX9-NEXT:    v_lshl_or_b32 v8, v12, 16, v13
-; GFX9-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v4, v8
-; GFX9-NEXT:    v_lshl_or_b32 v9, v9, 16, v10
-; GFX9-NEXT:    v_lshl_or_b32 v10, v14, 16, v15
+; GFX9-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
+; GFX9-NEXT:    v_lshl_or_b32 v9, v13, 16, v14
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u16_e32 v3, v4, v3
-; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v7, v10
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
 ; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshl_or_b32 v2, v18, 16, v2
-; GFX9-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
-; GFX9-NEXT:    v_lshl_or_b32 v6, v16, 16, v17
+; GFX9-NEXT:    v_lshl_or_b32 v2, v17, 16, v2
+; GFX9-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
+; GFX9-NEXT:    v_lshl_or_b32 v10, v15, 16, v16
 ; GFX9-NEXT:    v_add_u16_e32 v3, v3, v5
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v9, v6
+; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
 ; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_add_u16_e32 v3, v3, v2
 ; GFX9-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -2293,55 +2291,54 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 4, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v6, 15, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 16, 4
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v1, v1, 24, 4
+; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v1
+; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 4, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v13, 15, v2
-; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 16, 4
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v18, 28, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v12, 15, v2
+; GFX9-DL-NEXT:    v_bfe_u32 v4, v1, 4, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 12, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v11, v2, 4, 4
+; GFX9-DL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX9-DL-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX9-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 16, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 8, 4
+; GFX9-DL-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
+; GFX9-DL-NEXT:    v_lshl_or_b32 v7, v11, 16, v12
+; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
+; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 20, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 12, 4
+; GFX9-DL-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX9-DL-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
+; GFX9-DL-NEXT:    v_bfe_u32 v1, v1, 24, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 20, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 16, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v17, 28, v2
 ; GFX9-DL-NEXT:    v_bfe_u32 v2, v2, 24, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v2, v4, v2
-; GFX9-DL-NEXT:    v_and_b32_e32 v1, v4, v1
-; GFX9-DL-NEXT:    v_and_b32_e32 v17, v4, v17
-; GFX9-DL-NEXT:    v_and_b32_e32 v10, v4, v10
-; GFX9-DL-NEXT:    v_and_b32_e32 v15, v4, v15
-; GFX9-DL-NEXT:    v_and_b32_e32 v8, v4, v8
-; GFX9-DL-NEXT:    v_and_b32_e32 v13, v4, v13
-; GFX9-DL-NEXT:    v_and_b32_e32 v4, v4, v6
-; GFX9-DL-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
-; GFX9-DL-NEXT:    v_lshl_or_b32 v8, v12, 16, v13
-; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v8
-; GFX9-DL-NEXT:    v_lshl_or_b32 v9, v9, 16, v10
-; GFX9-DL-NEXT:    v_lshl_or_b32 v10, v14, 16, v15
+; GFX9-DL-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
+; GFX9-DL-NEXT:    v_lshl_or_b32 v9, v13, 16, v14
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_add_u16_e32 v3, v4, v3
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v7, v10
+; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
 ; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v18, 16, v2
-; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
-; GFX9-DL-NEXT:    v_lshl_or_b32 v6, v16, 16, v17
+; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v17, 16, v2
+; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
+; GFX9-DL-NEXT:    v_lshl_or_b32 v10, v15, 16, v16
 ; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v5
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v9, v6
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
 ; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v2
 ; GFX9-DL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -2479,8 +2476,6 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT:    s_movk_i32 s4, 0xf00
-; GFX7-NEXT:    s_movk_i32 s5, 0xf0f
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 4, v2
@@ -2492,27 +2487,27 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 4, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xf00, v8
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xf00, v4
 ; GFX7-NEXT:    v_and_b32_e32 v5, 15, v2
 ; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 4
 ; GFX7-NEXT:    v_and_b32_e32 v12, 15, v0
 ; GFX7-NEXT:    v_bfe_u32 v14, v0, 16, 4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
 ; GFX7-NEXT:    v_alignbit_b32 v2, v6, v2, 24
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v9
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xf00, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 4, v0
 ; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    v_alignbit_b32 v0, v13, v0, 24
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v11
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xf00, v11
 ; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v15
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v9
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xf00, v15
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xf00, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xf0f, v0
 ; GFX7-NEXT:    v_or_b32_e32 v8, v10, v8
-; GFX7-NEXT:    v_and_b32_e32 v2, s5, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xf0f, v2
 ; GFX7-NEXT:    v_or_b32_e32 v4, v14, v4
 ; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
 ; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
@@ -2980,55 +2975,54 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3]
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_bfe_u32 v5, v1, 4, 4
-; GFX9-NEXT:    v_and_b32_e32 v6, 15, v1
-; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v8, v1, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v9, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v10, v1, 16, 4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v1, 24, 4
+; GFX9-NEXT:    v_and_b32_e32 v5, 15, v1
+; GFX9-NEXT:    v_bfe_u32 v7, v1, 8, 4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_bfe_u32 v12, v2, 4, 4
-; GFX9-NEXT:    v_and_b32_e32 v13, 15, v2
-; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v16, v2, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v17, v2, 16, 4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 28, v2
+; GFX9-NEXT:    v_and_b32_e32 v12, 15, v2
+; GFX9-NEXT:    v_bfe_u32 v4, v1, 4, 4
+; GFX9-NEXT:    v_bfe_u32 v6, v1, 12, 4
+; GFX9-NEXT:    v_bfe_u32 v11, v2, 4, 4
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 4
+; GFX9-NEXT:    v_bfe_u32 v14, v2, 8, 4
+; GFX9-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
+; GFX9-NEXT:    v_lshl_or_b32 v7, v11, 16, v12
+; GFX9-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
+; GFX9-NEXT:    v_bfe_u32 v8, v1, 20, 4
+; GFX9-NEXT:    v_bfe_u32 v13, v2, 12, 4
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v1, 24, 4
+; GFX9-NEXT:    v_bfe_u32 v15, v2, 20, 4
+; GFX9-NEXT:    v_bfe_u32 v16, v2, 16, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 28, v2
 ; GFX9-NEXT:    v_bfe_u32 v2, v2, 24, 4
-; GFX9-NEXT:    v_and_b32_e32 v2, v4, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
-; GFX9-NEXT:    v_and_b32_e32 v17, v4, v17
-; GFX9-NEXT:    v_and_b32_e32 v10, v4, v10
-; GFX9-NEXT:    v_and_b32_e32 v15, v4, v15
-; GFX9-NEXT:    v_and_b32_e32 v8, v4, v8
-; GFX9-NEXT:    v_and_b32_e32 v13, v4, v13
-; GFX9-NEXT:    v_and_b32_e32 v4, v4, v6
-; GFX9-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
-; GFX9-NEXT:    v_lshl_or_b32 v8, v12, 16, v13
-; GFX9-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v4, v8
-; GFX9-NEXT:    v_lshl_or_b32 v9, v9, 16, v10
-; GFX9-NEXT:    v_lshl_or_b32 v10, v14, 16, v15
+; GFX9-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
+; GFX9-NEXT:    v_lshl_or_b32 v9, v13, 16, v14
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u16_e32 v3, v4, v3
-; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v7, v10
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
 ; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshl_or_b32 v2, v18, 16, v2
-; GFX9-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
-; GFX9-NEXT:    v_lshl_or_b32 v6, v16, 16, v17
+; GFX9-NEXT:    v_lshl_or_b32 v2, v17, 16, v2
+; GFX9-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
+; GFX9-NEXT:    v_lshl_or_b32 v10, v15, 16, v16
 ; GFX9-NEXT:    v_add_u16_e32 v3, v3, v5
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v9, v6
+; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
 ; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_add_u16_e32 v3, v3, v2
 ; GFX9-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -3048,55 +3042,54 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 4, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v6, 15, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 16, 4
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v1, v1, 24, 4
+; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v1
+; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 4, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v13, 15, v2
-; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 16, 4
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v18, 28, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v12, 15, v2
+; GFX9-DL-NEXT:    v_bfe_u32 v4, v1, 4, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 12, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v11, v2, 4, 4
+; GFX9-DL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX9-DL-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX9-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 16, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 8, 4
+; GFX9-DL-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
+; GFX9-DL-NEXT:    v_lshl_or_b32 v7, v11, 16, v12
+; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
+; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 20, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 12, 4
+; GFX9-DL-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX9-DL-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
+; GFX9-DL-NEXT:    v_bfe_u32 v1, v1, 24, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 20, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 16, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v17, 28, v2
 ; GFX9-DL-NEXT:    v_bfe_u32 v2, v2, 24, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v2, v4, v2
-; GFX9-DL-NEXT:    v_and_b32_e32 v1, v4, v1
-; GFX9-DL-NEXT:    v_and_b32_e32 v17, v4, v17
-; GFX9-DL-NEXT:    v_and_b32_e32 v10, v4, v10
-; GFX9-DL-NEXT:    v_and_b32_e32 v15, v4, v15
-; GFX9-DL-NEXT:    v_and_b32_e32 v8, v4, v8
-; GFX9-DL-NEXT:    v_and_b32_e32 v13, v4, v13
-; GFX9-DL-NEXT:    v_and_b32_e32 v4, v4, v6
-; GFX9-DL-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
-; GFX9-DL-NEXT:    v_lshl_or_b32 v8, v12, 16, v13
-; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v8
-; GFX9-DL-NEXT:    v_lshl_or_b32 v9, v9, 16, v10
-; GFX9-DL-NEXT:    v_lshl_or_b32 v10, v14, 16, v15
+; GFX9-DL-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
+; GFX9-DL-NEXT:    v_lshl_or_b32 v9, v13, 16, v14
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_add_u16_e32 v3, v4, v3
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v7, v10
+; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
 ; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v18, 16, v2
-; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
-; GFX9-DL-NEXT:    v_lshl_or_b32 v6, v16, 16, v17
+; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v17, 16, v2
+; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
+; GFX9-DL-NEXT:    v_lshl_or_b32 v10, v15, 16, v16
 ; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v5
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v9, v6
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
 ; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v2
 ; GFX9-DL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 1b13cc34f3ef8..9fadaee4def9f 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1304,13 +1304,12 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; SI-NEXT:    s_movk_i32 s4, 0xff
-; SI-NEXT:    s_lshr_b32 s5, s11, 8
+; SI-NEXT:    s_lshr_b32 s4, s11, 8
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; SI-NEXT:    v_and_b32_e32 v1, s4, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    s_cmp_lg_u32 s6, 13
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s6, 12
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
@@ -1318,28 +1317,27 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; SI-NEXT:    v_and_b32_e32 v2, s4, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    s_mov_b32 s5, 0xffff
-; SI-NEXT:    s_lshr_b32 s7, s10, 24
+; SI-NEXT:    s_lshr_b32 s4, s10, 24
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_and_b32_e32 v1, s5, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_cmp_lg_u32 s6, 11
 ; SI-NEXT:    v_or_b32_e32 v3, v1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s7
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_lshr_b32 s7, s10, 16
+; SI-NEXT:    s_lshr_b32 s4, s10, 16
 ; SI-NEXT:    s_cmp_lg_u32 s6, 10
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; SI-NEXT:    s_lshr_b32 s7, s10, 8
+; SI-NEXT:    s_lshr_b32 s4, s10, 8
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; SI-NEXT:    v_and_b32_e32 v1, s4, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    s_cmp_lg_u32 s6, 9
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s6, 8
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
@@ -1347,27 +1345,27 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; SI-NEXT:    v_and_b32_e32 v2, s4, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    s_lshr_b32 s7, s9, 24
+; SI-NEXT:    s_lshr_b32 s4, s9, 24
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_and_b32_e32 v1, s5, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_cmp_lg_u32 s6, 7
 ; SI-NEXT:    v_or_b32_e32 v2, v1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s7
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_lshr_b32 s7, s9, 16
+; SI-NEXT:    s_lshr_b32 s4, s9, 16
 ; SI-NEXT:    s_cmp_lg_u32 s6, 6
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; SI-NEXT:    s_lshr_b32 s7, s9, 8
+; SI-NEXT:    s_lshr_b32 s4, s9, 8
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; SI-NEXT:    v_and_b32_e32 v1, s4, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    s_cmp_lg_u32 s6, 5
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s6, 4
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
@@ -1375,27 +1373,27 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; SI-NEXT:    v_and_b32_e32 v4, s4, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; SI-NEXT:    v_or_b32_e32 v1, v4, v1
-; SI-NEXT:    s_lshr_b32 s7, s8, 24
+; SI-NEXT:    s_lshr_b32 s4, s8, 24
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_and_b32_e32 v1, s5, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_cmp_lg_u32 s6, 3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s7
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_lshr_b32 s7, s8, 16
+; SI-NEXT:    s_lshr_b32 s4, s8, 16
 ; SI-NEXT:    s_cmp_lg_u32 s6, 2
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, s7
+; SI-NEXT:    v_mov_b32_e32 v4, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
-; SI-NEXT:    s_lshr_b32 s7, s8, 8
+; SI-NEXT:    s_lshr_b32 s4, s8, 8
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; SI-NEXT:    v_and_b32_e32 v4, s4, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; SI-NEXT:    s_cmp_lg_u32 s6, 1
 ; SI-NEXT:    v_or_b32_e32 v0, v4, v0
-; SI-NEXT:    v_mov_b32_e32 v4, s7
+; SI-NEXT:    v_mov_b32_e32 v4, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s6, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
@@ -1403,10 +1401,10 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; SI-NEXT:    v_and_b32_e32 v5, s4, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; SI-NEXT:    v_or_b32_e32 v4, v5, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_and_b32_e32 v4, s5, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; SI-NEXT:    v_or_b32_e32 v0, v4, v0
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index ee401344580d6..8da0afa42c8ee 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1881,47 +1881,46 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(<8 x half> addrspace(1)
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
-; GFX9-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 4
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v5, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, v5, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 2
-; GFX9-NEXT:    v_lshl_or_b32 v3, v7, 16, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
+; GFX9-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v5, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v2, v5, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 0
-; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 16, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v10, v6, vcc
+; GFX9-NEXT:    v_lshl_or_b32 v2, v7, 16, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v5, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX9-NEXT:    v_and_b32_e32 v1, v5, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, v5, v0
-; GFX9-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v8, 16, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
index 7367f9c315661..89a7288091040 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@@ -553,10 +553,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
 ; GFX9-LABEL: sample_d_2d:
 ; GFX9:       ; %bb.0: ; %main_body
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v4, v6, v4
-; GFX9-NEXT:    v_and_b32_e32 v2, v6, v2
-; GFX9-NEXT:    v_and_b32_e32 v0, v6, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
@@ -585,13 +584,12 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v12, v8
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v6
+; GFX9-NEXT:    v_lshl_or_b32 v11, v7, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-NEXT:    v_and_b32_e32 v5, v2, v6
-; GFX9-NEXT:    v_and_b32_e32 v3, v2, v3
-; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX9-NEXT:    v_lshl_or_b32 v11, v7, 16, v5
-; GFX9-NEXT:    v_lshl_or_b32 v9, v4, 16, v3
+; GFX9-NEXT:    v_lshl_or_b32 v9, v4, 16, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
 ; GFX9-NEXT:    image_sample_d v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -636,13 +634,12 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
 ; GFX9-LABEL: sample_c_d_2d:
 ; GFX9:       ; %bb.0: ; %main_body
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, v9, v5
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v5
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v6, 16, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, v9, v7
-; GFX9-NEXT:    v_and_b32_e32 v1, v9, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v7
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v4, 16, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v8, 16, v1
 ; GFX9-NEXT:    image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16
@@ -689,10 +686,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
 ; GFX9-LABEL: sample_d_cl_2d:
 ; GFX9:       ; %bb.0: ; %main_body
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v4, v7, v4
-; GFX9-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX9-NEXT:    v_and_b32_e32 v0, v7, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v5, v5, 16, v4
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v3, 16, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v1, 16, v0
@@ -742,12 +738,11 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v11, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v5, v0, v5
-; GFX9-NEXT:    v_and_b32_e32 v3, v0, v3
-; GFX9-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX9-NEXT:    v_lshl_or_b32 v10, v6, 16, v5
-; GFX9-NEXT:    v_lshl_or_b32 v9, v4, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v5
+; GFX9-NEXT:    v_lshl_or_b32 v10, v6, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
+; GFX9-NEXT:    v_lshl_or_b32 v9, v4, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v8, v2, 16, v0
 ; GFX9-NEXT:    image_sample_c_d_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -789,10 +784,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
 ; GFX9-LABEL: sample_cd_2d:
 ; GFX9:       ; %bb.0: ; %main_body
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v4, v6, v4
-; GFX9-NEXT:    v_and_b32_e32 v2, v6, v2
-; GFX9-NEXT:    v_and_b32_e32 v0, v6, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
@@ -836,13 +830,12 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
 ; GFX9-LABEL: sample_c_cd_2d:
 ; GFX9:       ; %bb.0: ; %main_body
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, v9, v5
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v5
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v6, 16, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, v9, v7
-; GFX9-NEXT:    v_and_b32_e32 v1, v9, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v7
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v4, 16, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v8, 16, v1
 ; GFX9-NEXT:    image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16
@@ -889,10 +882,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
 ; GFX9-LABEL: sample_cd_cl_2d:
 ; GFX9:       ; %bb.0: ; %main_body
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v4, v7, v4
-; GFX9-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX9-NEXT:    v_and_b32_e32 v0, v7, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v5, v5, 16, v4
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v3, 16, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v1, 16, v0
@@ -942,12 +934,11 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v11, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v5, v0, v5
-; GFX9-NEXT:    v_and_b32_e32 v3, v0, v3
-; GFX9-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX9-NEXT:    v_lshl_or_b32 v10, v6, 16, v5
-; GFX9-NEXT:    v_lshl_or_b32 v9, v4, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v5
+; GFX9-NEXT:    v_lshl_or_b32 v10, v6, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
+; GFX9-NEXT:    v_lshl_or_b32 v9, v4, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v8, v2, 16, v0
 ; GFX9-NEXT:    image_sample_c_cd_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1136,13 +1127,12 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v13, v8
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v6
+; GFX9-NEXT:    v_lshl_or_b32 v12, v7, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX9-NEXT:    v_lshl_or_b32 v11, v5, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v9, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, v0, v6
-; GFX9-NEXT:    v_lshl_or_b32 v12, v7, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, v0, v4
-; GFX9-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX9-NEXT:    v_lshl_or_b32 v11, v5, 16, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v10, v3, 16, v0
 ; GFX9-NEXT:    image_sample_c_d_o v0, v[8:13], s[0:7], s[8:11] dmask:0x4 a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1172,13 +1162,12 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v13, v8
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v6
+; GFX9-NEXT:    v_lshl_or_b32 v12, v7, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX9-NEXT:    v_lshl_or_b32 v11, v5, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v9, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, v0, v6
-; GFX9-NEXT:    v_lshl_or_b32 v12, v7, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, v0, v4
-; GFX9-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX9-NEXT:    v_lshl_or_b32 v11, v5, 16, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v10, v3, 16, v0
 ; GFX9-NEXT:    image_sample_c_d_o v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 33b388829afb3..03b78c511183b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -84,16 +84,15 @@ define amdgpu_kernel void @cos_v2f16(<2 x half> addrspace(1)* %r, <2 x half> add
 ; GFX6-NEXT:    s_mov_b32 s8, s2
 ; GFX6-NEXT:    s_mov_b32 s9, s3
 ; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX6-NEXT:    s_mov_b32 s2, 0x3e22f983
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x3e22f983, v1
 ; GFX6-NEXT:    v_fract_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x3e22f983, v0
 ; GFX6-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cos_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cos_f32_e32 v1, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
index e3572de13faef..4a14e556c7066 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
@@ -32,25 +32,22 @@ entry:
 ; SI:     buffer_load_dword v[[A_F16_0:[0-9]+]]
 ; VI:     flat_load_dword v[[A_F16_0:[0-9]+]]
 ; GFX9:   global_load_dword v[[A_F16_0:[0-9]+]]
-; SI:     s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3f317218
-; GFX9:   s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c
 ; VI:     v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x398c
 ; SI:     v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
 ; SI:     v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]]
 ; SI:     v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
 ; SI:     v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
 ; SI:     v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
-; SI:     v_mul_f32_e32 v[[R_F32_5:[0-9]+]], [[A_F32_2]], v[[R_F32_0]]
+; SI:     v_mul_f32_e32 v[[R_F32_5:[0-9]+]], 0x3f317218, v[[R_F32_0]]
 ; SI:     v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_5]]
-; SI:     v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]]
+; SI:     v_mul_f32_e32 v[[R_F32_6:[0-9]+]], 0x3f317218, v[[R_F32_1]]
 ; SI:     v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]]
 ; GFX9:   v_log_f16_e32 v[[R_F16_2:[0-9]+]], v[[A_F16_0]]
 ; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI:     v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
 ; VI:     v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9:   v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]]
-; VI:     v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x398c, v[[R_F16_0]]
-; GFX9:   v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
+; GFX9:   v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x398c, v[[R_F16_2]]
+; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x398c, v[[R_F16_0]]
 ; SI:     v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
 ; SI-NOT: v_and_b32_e32
 ; SI:     v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index f99fa13bc01e5..7c6ef1c08077c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -32,10 +32,9 @@ entry:
 ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3f317218
 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}}
 define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %res = call <2 x float> @llvm.log.v2f32(<2 x float> %in)
@@ -67,14 +66,13 @@ entry:
 ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3f317218
 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}}
 define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %res = call <4 x float> @llvm.log.v4f32(<4 x float> %in)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
index d5cb21cfd0a76..c50552aacefeb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
@@ -32,25 +32,23 @@ entry:
 ; SI:     buffer_load_dword v[[A_F16_0:[0-9]+]]
 ; VI:     flat_load_dword v[[A_F16_0:[0-9]+]]
 ; GFX9:   global_load_dword v[[A_F16_0:[0-9]+]]
-; SI:     s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3e9a209a
-; GFX9:   s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1
 ; VI:     v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x34d1
 ; SI:     v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
 ; SI:     v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]]
 ; SI:     v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
 ; SI:     v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
 ; SI:     v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
-; SI:     v_mul_f32_e32 v[[R_F32_5:[0-9]+]], [[A_F32_2]], v[[R_F32_0]]
+; SI:     v_mul_f32_e32 v[[R_F32_5:[0-9]+]], 0x3e9a209a, v[[R_F32_0]]
 ; SI:     v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_5]]
-; SI:     v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]]
+; SI:     v_mul_f32_e32 v[[R_F32_6:[0-9]+]], 0x3e9a209a, v[[R_F32_1]]
 ; SI:     v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]]
 ; GFX9:   v_log_f16_e32 v[[R_F16_2:[0-9]+]], v[[A_F16_0]]
 ; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI:     v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
 ; VI:     v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI:     v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x34d1, v[[R_F16_0]]
-; GFX9:   v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]]
-; GFX9:   v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
+; GFX9:   v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x34d1, v[[R_F16_2]]
+; GFX9:   v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x34d1, v[[R_F16_0]]
 ; SI:     v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
 ; SI-NOT: v_and_b32_e32
 ; SI:     v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index bef1b264934d7..e2e55a412b8cf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -32,10 +32,9 @@ entry:
 ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3e9a209a
 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}}
 define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %res = call <2 x float> @llvm.log10.v2f32(<2 x float> %in)
@@ -67,14 +66,13 @@ entry:
 ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3e9a209a
 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}}
 define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %res = call <4 x float> @llvm.log10.v4f32(<4 x float> %in)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 7b047d7615cfb..3fcb1d925cb3d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -84,16 +84,15 @@ define amdgpu_kernel void @sin_v2f16(<2 x half> addrspace(1)* %r, <2 x half> add
 ; GFX6-NEXT:    s_mov_b32 s8, s2
 ; GFX6-NEXT:    s_mov_b32 s9, s3
 ; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX6-NEXT:    s_mov_b32 s2, 0x3e22f983
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x3e22f983, v1
 ; GFX6-NEXT:    v_fract_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x3e22f983, v0
 ; GFX6-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX6-NEXT:    v_sin_f32_e32 v0, v0
 ; GFX6-NEXT:    v_sin_f32_e32 v1, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 6c4071354cc82..381b19ba6b8c1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -1357,13 +1357,12 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, s2, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, s2, v1
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v1
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -1375,13 +1374,12 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    flat_load_dwordx2 v[3:4], v[0:1]
-; GCN-HSA-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s1
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s2, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s2, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v3
 ; GCN-HSA-NEXT:    flat_store_dwordx3 v[5:6], v[0:2]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
@@ -1396,13 +1394,12 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, s2, v1
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, 0xffff, v1
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s2, v0
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
@@ -1587,14 +1584,13 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, s2, v5
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, s2, v4
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v5
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v4
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -1605,14 +1601,13 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GCN-HSA-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s1
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s2, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s2, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v4
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
@@ -1627,14 +1622,13 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, s2, v1
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, 0xffff, v1
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s2, v0
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
@@ -1838,7 +1832,6 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
@@ -1846,10 +1839,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, s2, v1
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, s2, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, s2, v3
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, s2, v2
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v2
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -1857,7 +1850,6 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s4, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1871,12 +1863,12 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v10, s4, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, s4, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v2
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s4, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, s4, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
 ; GCN-HSA-NEXT:    s_endpgm
@@ -1892,18 +1884,17 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, s2, v3
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, 0xffff, v3
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, s2, v2
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v2
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, s2, v1
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, 0xffff, v1
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, s2, v0
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
@@ -2142,7 +2133,6 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
@@ -2154,16 +2144,16 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, s6, v1
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, s6, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, s6, v3
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, s6, v2
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v1
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, 0xffff, v3
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v2
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, s6, v5
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, s6, v4
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, s6, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, s6, v6
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v5
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v4
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v6
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
@@ -2173,7 +2163,6 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s4, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2198,23 +2187,23 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v10, s4, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, s4, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v10, s4, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, s4, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, s4, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v6
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v3, s4, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v1, s4, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v14, s4, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[1:4]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
@@ -2233,27 +2222,26 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v7
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, s6, v7
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, 0xffff, v7
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, s6, v6
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, s6, v1
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v6
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, 0xffff, v1
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, s6, v0
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v0
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v14, s6, v3
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v14, 0xffff, v3
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, s6, v2
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, 0xffff, v2
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, s6, v5
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, 0xffff, v5
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s6, v4
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v4
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
@@ -2622,7 +2610,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2640,34 +2627,34 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v7
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, s0, v3
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, s0, v2
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, s0, v1
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, s0, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v3
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v2
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, 0xffff, v1
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v0
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, s0, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, s0, v6
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, s0, v5
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, s0, v4
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, 0xffff, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xffff, v6
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v5
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v4
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v9
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, s0, v11
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, s0, v10
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v30, s0, v9
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v28, s0, v8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, 0xffff, v11
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v10
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v30, 0xffff, v9
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v28, 0xffff, v8
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v15
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v13
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, s0, v15
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, s0, v14
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, s0, v13
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v32, s0, v12
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v15
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v14
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, 0xffff, v13
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v32, 0xffff, v12
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96
@@ -2683,7 +2670,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s14, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -2722,8 +2708,8 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, s14, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, s14, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s13
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
@@ -2731,31 +2717,31 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, s14, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, s14, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v7
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, s14, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, s14, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v9
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s14, v9
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, s14, v8
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v9
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v8
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v11
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v10
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, s14, v11
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, s14, v10
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v11
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v10
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
@@ -2763,16 +2749,16 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v15
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v12
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, s14, v15
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, s14, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v15, s14, v12
+; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v15
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v13
+; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s14, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v3, s14, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v2
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v14
-; GCN-HSA-NEXT:    v_and_b32_e32 v7, s14, v14
+; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v14
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[15:18]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[7:10]
@@ -2793,45 +2779,42 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, s0, v3
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, 0xffff, v3
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v2
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, s0, v2
+; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v30, 0xffff, v15
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v14
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, 0xffff, v14
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v14, 0xffff, v13
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v2
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, s0, v1
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, 0xffff, v1
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s0, v0
-; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v7
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v22, s0, v7
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v22, 0xffff, v7
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v21, 16, v6
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, s0, v6
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, 0xffff, v6
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, s0, v5
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, 0xffff, v5
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, s0, v4
-; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v27, 16, v11
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v26, s0, v11
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v26, 0xffff, v11
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v10
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, s0, v10
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, 0xffff, v10
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, s0, v9
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, 0xffff, v9
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, s0, v8
-; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v30, s0, v15
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v14
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, s0, v14
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v14, s0, v13
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, s0, v12
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
@@ -3494,7 +3477,6 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v15
@@ -3504,16 +3486,16 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v18
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, s0, v15
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, s0, v14
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v15
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v14
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v5, s0, v13
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v5, 0xffff, v13
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v3, s0, v12
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v3, 0xffff, v12
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v4, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
@@ -3521,57 +3503,57 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v16
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, s0, v19
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v19
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, s0, v18
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, 0xffff, v18
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v7, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v8, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v9, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, s0, v17
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, s0, v16
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, 0xffff, v17
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v16
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v27
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v26
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v25
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v24
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, s0, v27
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, s0, v26
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, s0, v25
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, s0, v24
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v27
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v26
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, 0xffff, v25
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v24
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v31
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v30
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v29
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v28
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, s0, v31
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, s0, v30
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, 0xffff, v31
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xffff, v30
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v33, s0, v29
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v31, s0, v28
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v33, 0xffff, v29
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v31, 0xffff, v28
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v38
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v37
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v36
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v35
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v45, s0, v38
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v43, s0, v37
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v49, s0, v36
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v47, s0, v35
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v45, 0xffff, v38
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v43, 0xffff, v37
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v49, 0xffff, v36
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v47, 0xffff, v35
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v42
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v41
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v40
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v39
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v37, s0, v42
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v35, s0, v41
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v37, 0xffff, v42
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v35, 0xffff, v41
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[55:58], off, s[8:11], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v53, s0, v40
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v51, s0, v39
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v53, 0xffff, v40
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v51, 0xffff, v39
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v58
@@ -3579,19 +3561,19 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v56
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v60, s0, v58
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v58, s0, v57
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, s0, v56
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, s0, v55
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v60, 0xffff, v58
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v58, 0xffff, v57
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v56
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v55
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v42
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v41
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v40
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v39
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, s0, v42
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, s0, v41
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, s0, v40
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, s0, v39
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, 0xffff, v42
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v41
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v40
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v39
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
@@ -3632,7 +3614,6 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s12, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -3676,8 +3657,8 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(6)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v26, s12, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v24, s12, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[32:35], v[0:1]
@@ -3695,16 +3676,16 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0xa0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v26, s12, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v24, s12, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
 ; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
@@ -3714,8 +3695,8 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
@@ -3724,15 +3705,15 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v9
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v8
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v9
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v8
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s8
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v11
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v10
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v11
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s2
@@ -3743,30 +3724,30 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v14
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v13
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v12
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v12
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, s12, v14
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v13
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v12
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v15
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s12, v15
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v14
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(11)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v17
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v17
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v17
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v19
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v18
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v19
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v18
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v19
+; GCN-HSA-NEXT:    v_and_b32_e32 v1, 0xffff, v18
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[5:6], v[1:4]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
@@ -3781,43 +3762,43 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v33
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v32
-; GCN-HSA-NEXT:    v_and_b32_e32 v14, s12, v33
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, s12, v32
+; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v33
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v32
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v21
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v20
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v21
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v20
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v21
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v20
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v22
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s12, v23
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, s12, v22
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v23
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v22
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v35
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v34
-; GCN-HSA-NEXT:    v_and_b32_e32 v10, s12, v35
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, s12, v34
+; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v35
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v34
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v29
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v28
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s12, v29
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, s12, v28
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v29
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v28
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v30
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v31
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v30
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v31
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v30
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
@@ -3842,76 +3823,75 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v63, 16, v11
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v62, s4, v11
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v62, 0xffff, v11
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v61, 16, v10
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v60, s4, v10
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v60, 0xffff, v10
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, s4, v9
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, 0xffff, v9
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, s4, v8
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, s4, v7
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, 0xffff, v7
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s4, v6
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v6
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, s4, v5
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, 0xffff, v5
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, s4, v4
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v27
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v38, s4, v27
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v38, 0xffff, v27
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v36, s4, v26
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v36, 0xffff, v26
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v27, 16, v25
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v26, s4, v25
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v26, 0xffff, v25
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v24
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, s4, v24
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, 0xffff, v24
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v31
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v42, s4, v31
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v42, 0xffff, v31
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v41, 16, v30
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v40, s4, v30
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v40, 0xffff, v30
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v29
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v30, s4, v29
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v30, 0xffff, v29
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v28
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, s4, v28
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, 0xffff, v28
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v47, 16, v35
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v46, s4, v35
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v46, 0xffff, v35
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v45, 16, v34
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v44, s4, v34
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v44, 0xffff, v34
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v33
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v34, s4, v33
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v34, 0xffff, v33
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v32, s4, v32
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v32, 0xffff, v32
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v23
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v50, s4, v23
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v50, 0xffff, v23
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v22
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v48, s4, v22
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v48, 0xffff, v22
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v22, s4, v21
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v22, 0xffff, v21
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v21, 16, v20
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, s4, v20
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v19
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v54, s4, v19
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v54, 0xffff, v19
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v18
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v52, s4, v18
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v52, 0xffff, v18
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, s4, v17
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, 0xffff, v17
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, s4, v16
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v59, 16, v15
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v58, s4, v15
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v58, 0xffff, v15
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v14
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v56, s4, v14
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v56, 0xffff, v14
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v14, s4, v13
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v14, 0xffff, v13
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, s4, v12
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
@@ -5761,7 +5741,6 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[8:9], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v1
@@ -5770,8 +5749,8 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, s2, v8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, s2, v9
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v9
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -5779,7 +5758,6 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)
 ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s4, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5796,9 +5774,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s4, v9
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v9
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, s4, v8
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v8
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
 ; GCN-HSA-NEXT:    s_endpgm
@@ -5814,7 +5792,6 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx2 v[8:9], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
@@ -5822,9 +5799,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v5
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, s2, v9
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v9
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s2, v8
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v8
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -6065,15 +6042,14 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)
 define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, 0
@@ -6083,30 +6059,30 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, v9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, s12, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, s12, v2
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, s12, v1
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, s12, v3
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v2
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v1
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v3
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s4, 0xffff
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v12
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6121,24 +6097,23 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v11, s4, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v11, 0xffff, v3
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[11:14]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, v12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, v12
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v3, s4, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, s4, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v7, s4, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
@@ -6156,7 +6131,6 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, 0xffff
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v17, 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
@@ -6168,13 +6142,13 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v9, v17
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, v17
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, s6, v3
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v3
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, s6, v0
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, s6, v1
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v1
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, s6, v2
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, 0xffff, v2
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
@@ -6524,27 +6498,26 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, s0, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, s0, v2
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, s0, v1
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v2
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v1
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, s0, v3
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v3
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, 0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v5
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v23, s0, v6
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v23, 0xffff, v6
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, s0, v4
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, s0, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v19, s0, v5
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, 0xffff, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v19, 0xffff, v5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, v20
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, v20
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v29, v20
@@ -6577,10 +6550,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s6, 0xffff
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, 0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6598,29 +6571,28 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x50
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, 0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, s6, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v1
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x70
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, s6, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v5
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, s6, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v7
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s5
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, s6, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s4
@@ -6628,13 +6600,13 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, s6, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 0x60
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v7, s6, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v3
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[1:2], v[7:10]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, v8
@@ -6642,10 +6614,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, s6, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s6, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s2
@@ -6667,9 +6639,9 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, 0xffff
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v30, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v28, 0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v25, v28
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v21, v28
@@ -6681,27 +6653,26 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v23, 0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, s0, v0
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, s0, v1
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s0, v2
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, s0, v3
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v3
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v3
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, s0, v4
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, s0, v5
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v3, s0, v6
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v27, s0, v7
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v27, 0xffff, v7
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v7
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v3
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, 0xffff, v4
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, 0xffff, v5
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v5
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v3, 0xffff, v6
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v4, v28
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v27, 0
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v0
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, 0xffff, v1
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v2
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v28
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96
@@ -7301,7 +7272,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, 0xffff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
@@ -7316,48 +7286,48 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v3
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, s0, v2
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v2
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v18, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v19, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v20, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v21, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(2)
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v19, s0, v4
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v19, 0xffff, v4
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v22, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, s0, v3
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v3
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v30, s0, v5
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v30, 0xffff, v5
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, s0, v6
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, 0xffff, v6
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, s0, v8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, 0xffff, v8
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v38, s0, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v38, 0xffff, v7
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v9
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v42, s0, v9
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v42, 0xffff, v9
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v10
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v46, s0, v10
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v46, 0xffff, v10
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, s0, v12
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, 0xffff, v12
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v11
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v50, s0, v11
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v50, 0xffff, v11
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v13
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v54, s0, v13
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v54, 0xffff, v13
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v17
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v14
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v58, s0, v14
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v58, 0xffff, v14
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v16
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, s0, v16
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v16
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v15
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, s0, v15
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, s0, v17
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v15
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v17
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, v1
@@ -7439,7 +7409,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s18, 0xffff
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v4
@@ -7482,53 +7451,53 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v3
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[5:8]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[5:8]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s13
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v12
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v12
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s12
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[5:8]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s15
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v10
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v10
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s14
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[5:8]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s7
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v14
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v14
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v14
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s6
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[5:8]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v20
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v20
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v20
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s8
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[5:8]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s11
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v18
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v18
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v18
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s10
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[5:8]
-; GCN-HSA-NEXT:    v_and_b32_e32 v3, s18, v16
+; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v16
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v15
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[5:8]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v13
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v13
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
@@ -7539,26 +7508,26 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v19
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v19
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v19
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[5:8]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v17
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v17
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[5:8]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v9
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v11
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[5:8]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, s18, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
@@ -7566,7 +7535,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, s18, v9
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -7575,7 +7544,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, s18, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -7603,39 +7572,27 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, 0xffff
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v57, 0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v16, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v24, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v16, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v20, 0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s0, v2
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v9, s0, v1
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v45, 0xffff, v33
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v48, s0, v36
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v48, 0xffff, v36
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v50, 16, v36
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v54, s0, v38
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v54, 0xffff, v38
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v56, 16, v38
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v36, s0, v37
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v36, 0xffff, v37
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v37, 0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v13, s0, v4
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v17, s0, v3
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, s0, v6
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v21, s0, v5
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v25, s0, v8
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, s0, v7
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v39, s0, v32
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v41, 16, v32
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v42, s0, v31
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v32, s0, v34
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v45, s0, v33
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v51, s0, v35
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v55, v37
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v51, 0xffff, v35
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v35
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v52, v37
@@ -7647,7 +7604,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v46, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v48, 0
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, 0xffff, v7
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v7
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v42, 0xffff, v31
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v44, 16, v31
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v43, v37
@@ -7655,9 +7614,15 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v31, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v29, v37
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v9, 0xffff, v1
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v17, 0xffff, v3
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v3
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v25, 0xffff, v8
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v27, 16, v8
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v39, 0xffff, v32
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v41, 16, v32
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v32, 0xffff, v34
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v35, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v33, v37
@@ -7668,10 +7633,13 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v28, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v26, v37
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v14, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v10, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v37
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v13, 0xffff, v4
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v6
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v21, 0xffff, v5
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144
@@ -7680,6 +7648,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v22, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v18, v37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v14, v37
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80

diff  --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 6ad048b4e2fae..50a6dcf9b19bf 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -485,13 +485,12 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; CI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
-; CI-NEXT:    s_mov_b32 s4, 0xffff
 ; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; CI-NEXT:    v_and_b32_e32 v0, s4, v0
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; CI-NEXT:    v_and_b32_e32 v1, s4, v1
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
 ; CI-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
@@ -573,13 +572,12 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    s_mov_b32 s4, 0xff00ff
 ; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
 ; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
-; CI-NEXT:    v_and_b32_e32 v3, s4, v3
-; CI-NEXT:    v_and_b32_e32 v2, s4, v2
+; CI-NEXT:    v_and_b32_e32 v3, 0xff00ff, v3
+; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff, v2
 ; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
 ; CI-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/madmk.ll b/llvm/test/CodeGen/AMDGPU/madmk.ll
index 6dffa25e4cf1a..28c7831dd341b 100644
--- a/llvm/test/CodeGen/AMDGPU/madmk.ll
+++ b/llvm/test/CodeGen/AMDGPU/madmk.ll
@@ -31,9 +31,8 @@ define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float add
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x41200000
-; GCN-DAG: v_mac_f32_e32 [[VB]], [[SK]], [[VA]]
-; GCN-DAG: v_mac_f32_e32 [[VC]], [[SK]], [[VA]]
+; GCN-DAG: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]]
+; GCN-DAG: v_mac_f32_e32 [[VC]], 0x41200000, [[VA]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

diff  --git a/llvm/test/CodeGen/AMDGPU/mul.i16.ll b/llvm/test/CodeGen/AMDGPU/mul.i16.ll
index bc42cc7fd0f80..6cdff0047d13f 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.i16.ll
@@ -3,9 +3,8 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
 
 ; GCN-LABEL: {{^}}v_mul_i16:
-; SI: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
-; SI: v_and_b32_e32 v{{[0-9]+}}, [[K]]
-; SI: v_and_b32_e32 v{{[0-9]+}}, [[K]]
+; SI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
 ; SI: v_mul_u32_u24
 
 ; GFX89: v_mul_lo_u16_e32 v0, v0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index f625977b6854d..ae490e1bfa6a6 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -824,13 +824,12 @@ define i17 @test_umul24_anyextend_i23_src0_src1(i23 %a, i23 %b) {
 ; GCN-LABEL: test_umul24_anyextend_i23_src0_src1:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, 0x7fffff
-; GCN-NEXT:    v_and_b32_e32 v0, s4, v0
-; GCN-NEXT:    v_and_b32_e32 v1, s4, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0x7fffff, v1
 ; GCN-NEXT:    v_mul_u32_u24_e32 v0, 0xea, v0
 ; GCN-NEXT:    v_mul_u32_u24_e32 v1, 0x39b, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x7ffffe, v0
-; GCN-NEXT:    v_and_b32_e32 v1, s4, v1
+; GCN-NEXT:    v_and_b32_e32 v1, 0x7fffff, v1
 ; GCN-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x1fffe, v0
 ; GCN-NEXT:    v_mul_u32_u24_e32 v0, 0x63, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
index 643a872f76e29..008856beef90e 100644
--- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
@@ -89,11 +89,10 @@ entry:
   ret void
 }
 ; CHECK-LABEL: {{^}}vector_imm:
-; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64
-; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
-; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
-; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
-; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
+; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}}
+; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}}
+; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}}
+; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}}
 
 define amdgpu_kernel void @vector_imm(<4 x i32> addrspace(1)* %out) #1 {
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index b66b9cc355163..43f044f89db51 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -50,8 +50,8 @@ define amdgpu_kernel void @fadd_v32_vs(<32 x float> addrspace(1)* %a, <32 x floa
 }
 
 ; GCN-LABEL: {{^}}fadd_v2_v_imm:
-; GCN:            s_mov_b32 s[[K:[0-9]+]], 0x42c80000
-; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s[[K]], v{{[0-9]+}}
+; GFX90A:         s_mov_b32 s[[K:[0-9]+]], 0x42c80000
+; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
 ; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
 define amdgpu_kernel void @fadd_v2_v_imm(<2 x float> addrspace(1)* %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -261,8 +261,8 @@ define amdgpu_kernel void @fmul_v32_vs(<32 x float> addrspace(1)* %a, <32 x floa
 }
 
 ; GCN-LABEL: {{^}}fmul_v2_v_imm:
-; GCN:            s_mov_b32 s[[K:[0-9]+]], 0x42c80000
-; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s[[K]], v{{[0-9]+}}
+; GFX90A:         s_mov_b32 s[[K:[0-9]+]], 0x42c80000
+; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
 ; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
 define amdgpu_kernel void @fmul_v2_v_imm(<2 x float> addrspace(1)* %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -552,8 +552,7 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}fneg_v2f32_vec:
-; GFX900:         s_brev_b32 [[SIGN:s[0-9]+]], 1
-; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, [[SIGN]], v{{[0-9]+}}
+; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
 ; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 neg_lo:[1,1] neg_hi:[1,1]{{$}}
 define amdgpu_kernel void @fneg_v2f32_vec(<2 x float> addrspace(1)* %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
index 0b4e05c6d0f2f..28ee9637ef9c0 100644
--- a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
@@ -2,9 +2,8 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}s_movk_i32_k0:
-; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}}
 ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
@@ -16,9 +15,8 @@ define amdgpu_kernel void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1
 }
 
 ; SI-LABEL: {{^}}s_movk_i32_k1:
-; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}}
 ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x7fff, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
@@ -32,7 +30,7 @@ define amdgpu_kernel void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1
 ; SI-LABEL: {{^}}s_movk_i32_k2:
 ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}}
 ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x7fff, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 64, v[[HI_VREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
@@ -44,9 +42,8 @@ define amdgpu_kernel void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1
 }
 
 ; SI-LABEL: {{^}}s_movk_i32_k3:
-; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}}
 ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x8000, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
@@ -58,9 +55,8 @@ define amdgpu_kernel void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1
 }
 
 ; SI-LABEL: {{^}}s_movk_i32_k4:
-; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}}
 ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x20000, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
@@ -72,11 +68,9 @@ define amdgpu_kernel void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1
 }
 
 ; SI-LABEL: {{^}}s_movk_i32_k5:
-; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0xffef{{$}}
-; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0xff00ffff{{$}}
 ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffffffef, v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xff00ffff, v[[HI_VREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
@@ -87,9 +81,8 @@ define amdgpu_kernel void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1
 }
 
 ; SI-LABEL: {{^}}s_movk_i32_k6:
-; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}}
 ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x41, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 63, v[[HI_VREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
@@ -101,11 +94,9 @@ define amdgpu_kernel void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1
 }
 
 ; SI-LABEL: {{^}}s_movk_i32_k7:
-; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x2000{{$}}
-; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x4000{{$}}
 ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x2000, v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x4000, v[[HI_VREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
@@ -116,11 +107,9 @@ define amdgpu_kernel void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1
 }
 
 ; SI-LABEL: {{^}}s_movk_i32_k8:
-; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}}
-; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
 ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8000, v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
@@ -131,11 +120,9 @@ define amdgpu_kernel void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1
 }
 
 ; SI-LABEL: {{^}}s_movk_i32_k9:
-; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8001{{$}}
-; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
 ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8001, v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
@@ -146,11 +133,9 @@ define amdgpu_kernel void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1
 }
 
 ; SI-LABEL: {{^}}s_movk_i32_k10:
-; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8888{{$}}
-; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
 ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8888, v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
@@ -161,11 +146,9 @@ define amdgpu_kernel void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(
 }
 
 ; SI-LABEL: {{^}}s_movk_i32_k11:
-; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8fff{{$}}
-; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
 ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8fff, v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
@@ -176,11 +159,9 @@ define amdgpu_kernel void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(
 }
 
 ; SI-LABEL: {{^}}s_movk_i32_k12:
-; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff7001{{$}}
-; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
 ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff7001, v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index 8d77d86cb278e..3a89892e8905c 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -134,13 +134,11 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
-; GFX6-NEXT:    s_movk_i32 s5, 0x8000
-; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
-; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
+; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
+; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
+; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
+; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -156,15 +154,14 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v4, v3
 ; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v4
-; GFX8-NEXT:    s_movk_i32 s6, 0x8000
-; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_add_u16_e32 v1, v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v1
-; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
@@ -198,18 +195,16 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
-; GFX6-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
-; GFX6-NEXT:    s_movk_i32 s5, 0x8000
-; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
+; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
+; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
-; GFX6-NEXT:    v_min_i32_e32 v2, s4, v2
+; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
+; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
+; GFX6-NEXT:    v_min_i32_e32 v2, 0x7fff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_max_i32_e32 v3, s5, v2
+; GFX6-NEXT:    v_max_i32_e32 v3, 0xffff8000, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, 0xffff0000, v3
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
@@ -224,22 +219,21 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
-; GFX8-NEXT:    s_movk_i32 s6, 0x8000
-; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
+; GFX8-NEXT:    v_xor_b32_e32 v4, 0xffff8000, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_add_u16_e32 v3, v1, v3
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
-; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_add_u16_e32 v2, v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
-; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
@@ -273,29 +267,26 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; GFX6-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
-; GFX6-NEXT:    s_movk_i32 s5, 0x8000
-; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
-; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
-; GFX6-NEXT:    s_mov_b32 s6, 0xffff
+; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
+; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
+; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
+; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
 ; GFX6-NEXT:    v_bfe_i32 v6, v6, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v7, v7, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v7
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
-; GFX6-NEXT:    v_min_i32_e32 v2, s4, v2
-; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT:    v_max_i32_e32 v2, s5, v2
+; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
+; GFX6-NEXT:    v_min_i32_e32 v2, 0x7fff, v2
+; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
+; GFX6-NEXT:    v_max_i32_e32 v2, 0xffff8000, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s6, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -308,15 +299,14 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
-; GFX8-NEXT:    s_movk_i32 s6, 0x8000
-; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
+; GFX8-NEXT:    v_xor_b32_e32 v4, 0xffff8000, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_add_u16_e32 v2, v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
-; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -327,14 +317,14 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
 ; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v5
-; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_add_u16_e32 v3, v1, v3
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
-; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -368,15 +358,14 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v0, v2
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
-; GFX6-NEXT:    s_brev_b32 s6, 1
-; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v1, v3
 ; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
-; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -388,15 +377,14 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
-; GFX8-NEXT:    s_brev_b32 s6, 1
-; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v1, v3
 ; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
-; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index a0421e5adc73b..c0c609a24f844 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -401,7 +401,6 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; GCN-NEXT:    s_mov_b32 s8, s2
 ; GCN-NEXT:    s_mov_b32 s9, s3
 ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -421,8 +420,8 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; GCN-NEXT:    v_sub_i32_e32 v11, vcc, 0, v3
-; GCN-NEXT:    v_mul_f32_e32 v5, s2, v5
-; GCN-NEXT:    v_mul_f32_e32 v7, s2, v7
+; GCN-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GCN-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
@@ -475,7 +474,6 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; TONGA-NEXT:    s_mov_b32 s8, s2
 ; TONGA-NEXT:    s_mov_b32 s9, s3
 ; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; TONGA-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; TONGA-NEXT:    s_mov_b32 s4, s0
 ; TONGA-NEXT:    s_mov_b32 s5, s1
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
@@ -495,8 +493,8 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, 0, v3
-; TONGA-NEXT:    v_mul_f32_e32 v5, s2, v5
-; TONGA-NEXT:    v_mul_f32_e32 v7, s2, v7
+; TONGA-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; TONGA-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
@@ -549,7 +547,6 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; GFX9-NEXT:    s_mov_b32 s8, s2
 ; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GFX9-NEXT:    s_mov_b32 s4, s0
 ; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -567,8 +564,8 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v1
-; GFX9-NEXT:    v_mul_f32_e32 v6, s2, v6
-; GFX9-NEXT:    v_mul_f32_e32 v7, s2, v7
+; GFX9-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX9-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v8
@@ -807,123 +804,122 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GCN-NEXT:    s_mov_b32 s5, s3
 ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GCN-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GCN-NEXT:    s_mov_b32 s8, s0
 ; GCN-NEXT:    s_mov_b32 s9, s1
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
 ; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
-; GCN-NEXT:    v_xor_b32_e32 v5, v5, v11
-; GCN-NEXT:    v_xor_b32_e32 v15, v8, v9
-; GCN-NEXT:    v_xor_b32_e32 v4, v4, v9
-; GCN-NEXT:    v_cvt_f32_u32_e32 v9, v5
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GCN-NEXT:    v_xor_b32_e32 v4, v4, v9
+; GCN-NEXT:    v_xor_b32_e32 v15, v8, v9
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v9, v9
+; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v4
+; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v8
 ; GCN-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
 ; GCN-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v8
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
+; GCN-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; GCN-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GCN-NEXT:    v_mul_lo_u32 v9, v9, v8
+; GCN-NEXT:    v_xor_b32_e32 v5, v5, v11
 ; GCN-NEXT:    v_xor_b32_e32 v6, v6, v13
-; GCN-NEXT:    v_mul_f32_e32 v9, s2, v9
 ; GCN-NEXT:    v_xor_b32_e32 v16, v10, v11
+; GCN-NEXT:    v_mul_hi_u32 v9, v8, v9
+; GCN-NEXT:    v_xor_b32_e32 v1, v1, v10
+; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v5
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v11, v6
-; GCN-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v11, v11
+; GCN-NEXT:    v_mul_hi_u32 v8, v0, v8
 ; GCN-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
+; GCN-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v10
+; GCN-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v11
+; GCN-NEXT:    v_mul_lo_u32 v11, v8, v4
+; GCN-NEXT:    v_cvt_u32_f32_e32 v9, v9
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
 ; GCN-NEXT:    v_xor_b32_e32 v17, v12, v13
 ; GCN-NEXT:    v_xor_b32_e32 v2, v2, v12
-; GCN-NEXT:    v_mul_f32_e32 v8, s2, v8
 ; GCN-NEXT:    v_sub_i32_e32 v12, vcc, 0, v5
-; GCN-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v11, v11
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v11
+; GCN-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; GCN-NEXT:    v_mul_lo_u32 v12, v12, v9
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
-; GCN-NEXT:    v_xor_b32_e32 v1, v1, v10
-; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
-; GCN-NEXT:    v_mul_lo_u32 v10, v10, v8
-; GCN-NEXT:    v_mul_hi_u32 v12, v9, v12
-; GCN-NEXT:    v_mul_f32_e32 v11, s2, v11
-; GCN-NEXT:    v_cvt_u32_f32_e32 v11, v11
-; GCN-NEXT:    v_mul_hi_u32 v10, v8, v10
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
-; GCN-NEXT:    v_sub_i32_e32 v12, vcc, 0, v6
-; GCN-NEXT:    v_mul_lo_u32 v12, v12, v11
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v8
-; GCN-NEXT:    v_mul_hi_u32 v12, v11, v12
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v8
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[0:1]
+; GCN-NEXT:    v_sub_i32_e32 v11, vcc, v0, v4
 ; GCN-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
-; GCN-NEXT:    v_xor_b32_e32 v7, v7, v14
-; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v7
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GCN-NEXT:    v_mul_lo_u32 v12, v8, v4
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
-; GCN-NEXT:    v_mul_hi_u32 v9, v1, v9
-; GCN-NEXT:    v_mul_hi_u32 v11, v2, v11
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v12
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
-; GCN-NEXT:    v_sub_i32_e32 v12, vcc, v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[0:1]
-; GCN-NEXT:    v_mul_f32_e32 v10, s2, v10
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
-; GCN-NEXT:    v_mul_lo_u32 v0, v9, v5
-; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v10
-; GCN-NEXT:    v_mul_lo_u32 v10, v11, v6
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, 1, v9
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 0, v6
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, v10
+; GCN-NEXT:    v_xor_b32_e32 v4, v7, v14
+; GCN-NEXT:    v_mul_hi_u32 v7, v9, v12
+; GCN-NEXT:    v_cvt_f32_u32_e32 v12, v4
+; GCN-NEXT:    v_mul_hi_u32 v0, v10, v0
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v8
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GCN-NEXT:    v_mul_hi_u32 v7, v1, v7
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; GCN-NEXT:    v_mul_hi_u32 v0, v2, v0
+; GCN-NEXT:    v_mul_lo_u32 v10, v7, v5
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v12, v12
+; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v4
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v10
+; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v5
+; GCN-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
+; GCN-NEXT:    v_cvt_u32_f32_e32 v12, v12
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v5
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, 1, v11
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[2:3]
-; GCN-NEXT:    v_sub_i32_e32 v9, vcc, v0, v5
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, 1, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[2:3]
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, 1, v0
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s[4:5]
-; GCN-NEXT:    v_sub_i32_e32 v11, vcc, v2, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[2:3]
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v10, v0, v10, s[4:5]
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v2, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[4:5]
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, 1, v7
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v9, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v7, v1, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v1, v8, v15
 ; GCN-NEXT:    v_xor_b32_e32 v5, v0, v16
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v15
 ; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v5, v16
-; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v7
-; GCN-NEXT:    v_mul_lo_u32 v5, v5, v4
-; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, v4, v5
-; GCN-NEXT:    v_xor_b32_e32 v3, v3, v9
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s[4:5]
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v10
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, v9, v12
+; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, v12, v5
+; GCN-NEXT:    v_xor_b32_e32 v3, v3, v8
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v10
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
+; GCN-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v10, v8, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v10, v7, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v2, v2, v17
-; GCN-NEXT:    v_mul_lo_u32 v5, v4, v7
+; GCN-NEXT:    v_mul_lo_u32 v6, v5, v4
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v17
-; GCN-NEXT:    v_xor_b32_e32 v6, v9, v14
-; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v7
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
-; GCN-NEXT:    v_sub_i32_e32 v5, vcc, v3, v7
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; GCN-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GCN-NEXT:    v_xor_b32_e32 v7, v8, v14
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v5
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
+; GCN-NEXT:    v_sub_i32_e32 v6, vcc, v3, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v5
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; GCN-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
 ; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -939,123 +935,122 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; TONGA-NEXT:    s_mov_b32 s5, s3
 ; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; TONGA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; TONGA-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; TONGA-NEXT:    s_mov_b32 s8, s0
 ; TONGA-NEXT:    s_mov_b32 s9, s1
 ; TONGA-NEXT:    s_waitcnt vmcnt(1)
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
-; TONGA-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v11, v5
 ; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v9, v4
-; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v11
-; TONGA-NEXT:    v_xor_b32_e32 v15, v8, v9
-; TONGA-NEXT:    v_xor_b32_e32 v4, v4, v9
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v9, v5
 ; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v8, v0
+; TONGA-NEXT:    v_xor_b32_e32 v4, v4, v9
+; TONGA-NEXT:    v_xor_b32_e32 v15, v8, v9
 ; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v8
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v9, v9
+; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v4
+; TONGA-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v8, v8
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
-; TONGA-NEXT:    v_add_u32_e32 v6, vcc, v13, v6
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v8, v8
+; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v11, v5
+; TONGA-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; TONGA-NEXT:    v_add_u32_e32 v6, vcc, v13, v6
+; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v10, v1
+; TONGA-NEXT:    v_mul_lo_u32 v9, v9, v8
+; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v11
 ; TONGA-NEXT:    v_xor_b32_e32 v6, v6, v13
-; TONGA-NEXT:    v_mul_f32_e32 v9, s2, v9
 ; TONGA-NEXT:    v_xor_b32_e32 v16, v10, v11
+; TONGA-NEXT:    v_mul_hi_u32 v9, v8, v9
+; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v10
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v10, v5
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v11, v6
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v9, v8
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v10, v10
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v11, v11
+; TONGA-NEXT:    v_mul_hi_u32 v8, v0, v8
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
+; TONGA-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v10
+; TONGA-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v11
+; TONGA-NEXT:    v_mul_lo_u32 v11, v8, v4
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v9, v9
 ; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v12, v2
 ; TONGA-NEXT:    v_xor_b32_e32 v17, v12, v13
 ; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v12
-; TONGA-NEXT:    v_mul_f32_e32 v8, s2, v8
 ; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, 0, v5
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v11, v11
+; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v11
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; TONGA-NEXT:    v_mul_lo_u32 v12, v12, v9
-; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v10, v1
-; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v10
-; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, 0, v4
-; TONGA-NEXT:    v_mul_lo_u32 v10, v10, v8
-; TONGA-NEXT:    v_mul_hi_u32 v12, v9, v12
-; TONGA-NEXT:    v_mul_f32_e32 v11, s2, v11
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v11, v11
-; TONGA-NEXT:    v_mul_hi_u32 v10, v8, v10
-; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v12, v9
-; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, 0, v6
-; TONGA-NEXT:    v_mul_lo_u32 v12, v12, v11
-; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v10, v8
-; TONGA-NEXT:    v_mul_hi_u32 v8, v0, v8
-; TONGA-NEXT:    v_mul_hi_u32 v12, v11, v12
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v8
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
+; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[0:1]
+; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, v0, v4
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
+; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s[0:1]
 ; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v14, v7
-; TONGA-NEXT:    v_xor_b32_e32 v7, v7, v14
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v10, v7
-; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v12, v11
-; TONGA-NEXT:    v_mul_lo_u32 v12, v8, v4
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v10, v10
-; TONGA-NEXT:    v_mul_hi_u32 v9, v1, v9
-; TONGA-NEXT:    v_mul_hi_u32 v11, v2, v11
-; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v12
-; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
-; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
-; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, v0, v4
-; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[0:1]
-; TONGA-NEXT:    v_mul_f32_e32 v10, s2, v10
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
-; TONGA-NEXT:    v_mul_lo_u32 v0, v9, v5
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v4, v10
-; TONGA-NEXT:    v_mul_lo_u32 v10, v11, v6
-; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
-; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v1, v0
-; TONGA-NEXT:    v_add_u32_e32 v1, vcc, 1, v9
+; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, 0, v6
+; TONGA-NEXT:    v_mul_lo_u32 v0, v0, v10
+; TONGA-NEXT:    v_xor_b32_e32 v4, v7, v14
+; TONGA-NEXT:    v_mul_hi_u32 v7, v9, v12
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v12, v4
+; TONGA-NEXT:    v_mul_hi_u32 v0, v10, v0
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v8
+; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
+; TONGA-NEXT:    v_mul_hi_u32 v7, v1, v7
+; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v10
+; TONGA-NEXT:    v_mul_hi_u32 v0, v2, v0
+; TONGA-NEXT:    v_mul_lo_u32 v10, v7, v5
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v12, v12
+; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v4
+; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v10
+; TONGA-NEXT:    v_mul_lo_u32 v10, v0, v6
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v5
+; TONGA-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v12, v12
 ; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, v2, v10
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v5
-; TONGA-NEXT:    v_add_u32_e32 v10, vcc, 1, v11
-; TONGA-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[2:3]
-; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, v0, v5
+; TONGA-NEXT:    v_add_u32_e32 v10, vcc, 1, v7
+; TONGA-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[2:3]
+; TONGA-NEXT:    v_add_u32_e32 v10, vcc, 1, v0
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
-; TONGA-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s[4:5]
-; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, v2, v6
-; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[2:3]
-; TONGA-NEXT:    v_add_u32_e32 v9, vcc, 1, v1
+; TONGA-NEXT:    v_cndmask_b32_e64 v10, v0, v10, s[4:5]
+; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v1, v5
+; TONGA-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[2:3]
+; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v2, v6
+; TONGA-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[4:5]
+; TONGA-NEXT:    v_add_u32_e32 v1, vcc, 1, v7
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
-; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
-; TONGA-NEXT:    v_cndmask_b32_e32 v0, v1, v9, vcc
+; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[0:1]
+; TONGA-NEXT:    v_cndmask_b32_e32 v0, v7, v1, vcc
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v8, v15
 ; TONGA-NEXT:    v_xor_b32_e32 v5, v0, v16
 ; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v15, v1
 ; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v16, v5
-; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v7
-; TONGA-NEXT:    v_mul_lo_u32 v5, v5, v4
-; TONGA-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
-; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v9, v3
-; TONGA-NEXT:    v_mul_hi_u32 v5, v4, v5
-; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v9
-; TONGA-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s[4:5]
-; TONGA-NEXT:    v_add_u32_e32 v8, vcc, 1, v10
-; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
-; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
+; TONGA-NEXT:    v_mul_lo_u32 v5, v9, v12
+; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v8, v3
+; TONGA-NEXT:    v_mul_hi_u32 v5, v12, v5
+; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v8
+; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v10
+; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v5, v12
+; TONGA-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; TONGA-NEXT:    v_cndmask_b32_e32 v2, v10, v8, vcc
+; TONGA-NEXT:    v_cndmask_b32_e32 v2, v10, v7, vcc
 ; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v17
-; TONGA-NEXT:    v_mul_lo_u32 v5, v4, v7
+; TONGA-NEXT:    v_mul_lo_u32 v6, v5, v4
 ; TONGA-NEXT:    v_subrev_u32_e32 v2, vcc, v17, v2
-; TONGA-NEXT:    v_xor_b32_e32 v6, v9, v14
-; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v3, v5
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v7
-; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
-; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, v3, v7
-; TONGA-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
-; TONGA-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v6
-; TONGA-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
+; TONGA-NEXT:    v_xor_b32_e32 v7, v8, v14
+; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
+; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v5
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v4
+; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
+; TONGA-NEXT:    v_sub_u32_e32 v6, vcc, v3, v4
+; TONGA-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v5
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
+; TONGA-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v7
+; TONGA-NEXT:    v_subrev_u32_e32 v3, vcc, v7, v3
 ; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; TONGA-NEXT:    s_endpgm
 ;
@@ -1071,7 +1066,6 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GFX9-NEXT:    s_mov_b32 s5, s3
 ; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GFX9-NEXT:    s_mov_b32 s8, s0
 ; GFX9-NEXT:    s_mov_b32 s9, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
@@ -1112,13 +1106,13 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v10, v10
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v12, v12
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v14, v14
-; GFX9-NEXT:    v_mul_f32_e32 v8, s2, v8
+; GFX9-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GFX9-NEXT:    v_mul_f32_e32 v10, s2, v10
-; GFX9-NEXT:    v_mul_f32_e32 v12, s2, v12
+; GFX9-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
+; GFX9-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; GFX9-NEXT:    v_sub_u32_e32 v9, 0, v4
-; GFX9-NEXT:    v_mul_f32_e32 v14, s2, v14
+; GFX9-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v12
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v14, v14
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v9, v8

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 6fec9f648034a..56719aee1acbd 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1794,7 +1794,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    s_or_b32 s0, s0, 1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mul_f32_e32 v1, s3, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_mad_f32 v2, -v1, v0, s3
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
@@ -1821,7 +1821,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    s_or_b32 s0, s0, 1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_mul_f32_e32 v1, s3, v1
+; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_mad_f32 v2, -v1, v0, s3
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
@@ -1906,7 +1906,7 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
 ; GCN-NEXT:    v_or_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v1
-; GCN-NEXT:    v_mul_f32_e32 v2, s4, v2
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v3, -v2, v1, s4
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
@@ -1926,7 +1926,7 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, 1, v0
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v1
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, s4, v2
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v3, -v2, v1, s4
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
@@ -1951,7 +1951,7 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
 ; GCN-NEXT:    v_or_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v1
-; GCN-NEXT:    v_mul_f32_e32 v2, s4, v2
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x47000000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v3, -v2, v1, s4
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
@@ -1971,7 +1971,7 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, 1, v0
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v1
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, s4, v2
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, 0x47000000, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v3, -v2, v1, s4
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index ada9fe811185e..f7092e525adc5 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -507,9 +507,9 @@ store_label:
 
 ; Check that "pulling out" SDWA operands works correctly.
 ; GCN-LABEL: {{^}}pulled_out_test:
-; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, v{{[0-9]+}}
 ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
-; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, v{{[0-9]+}}
 ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-NOT: v_and_b32_sdwa

diff  --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index dbe7fb6cc0859..70dd26200712e 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -639,7 +639,6 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> add
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b32 s4, 0xffff
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
@@ -650,8 +649,8 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> add
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, v9, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, v8, v6
-; SI-NEXT:    v_and_b32_e32 v1, s4, v1
-; SI-NEXT:    v_and_b32_e32 v0, s4, v0
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index fb15854e1fb97..f6cecaf9c792f 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -490,7 +490,6 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; CI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
-; CI-NEXT:    s_mov_b32 s4, 0xffff
 ; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
@@ -501,8 +500,8 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, v9, v7
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, v8, v6
-; CI-NEXT:    v_and_b32_e32 v1, s4, v1
-; CI-NEXT:    v_and_b32_e32 v0, s4, v0
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -555,7 +554,6 @@ define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_mov_b32 s2, 0xff000000
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -564,8 +562,8 @@ define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i1
 ; VI-NEXT:    v_lshlrev_b16_e32 v5, 8, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_and_b32_e32 v4, s2, v4
-; VI-NEXT:    v_and_b32_e32 v0, s2, v0
+; VI-NEXT:    v_and_b32_e32 v4, 0xff000000, v4
+; VI-NEXT:    v_and_b32_e32 v0, 0xff000000, v0
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v5, v0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -581,14 +579,13 @@ define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    s_mov_b32 s4, 0xff00
 ; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v4, 8, v3
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; CI-NEXT:    v_and_b32_e32 v4, s4, v4
+; CI-NEXT:    v_and_b32_e32 v4, 0xff00, v4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT:    v_and_b32_e32 v3, s4, v3
+; CI-NEXT:    v_and_b32_e32 v3, 0xff00, v3
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; CI-NEXT:    v_or_b32_e32 v3, v3, v4
 ; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index 090ac93961235..3474235c3f5f9 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -240,7 +240,6 @@ define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> ad
 ; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    s_mov_b32 s9, s3
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SI-NEXT:    s_mov_b32 s2, 0xffff
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -255,9 +254,9 @@ define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> ad
 ; SI-NEXT:    v_ashr_i32_e32 v0, v0, v6
 ; SI-NEXT:    v_ashr_i32_e32 v2, v4, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_and_b32_e32 v3, s2, v3
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_and_b32_e32 v2, s2, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
index 9b24fc80e6f45..d4bccbc846c7d 100644
--- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
@@ -64,8 +64,7 @@ define <3 x i1> @test_srem_vec(<3 x i31> %X) nounwind {
 ; CHECK-NEXT:    v_bfe_i32 v5, v0, 0, 31
 ; CHECK-NEXT:    s_mov_b32 s4, 0x38e38e39
 ; CHECK-NEXT:    s_mov_b32 s5, 0xc71c71c7
-; CHECK-NEXT:    s_brev_b32 s6, -2
-; CHECK-NEXT:    s_mov_b32 s7, 0x7ffffffd
+; CHECK-NEXT:    s_mov_b32 s6, 0x7ffffffd
 ; CHECK-NEXT:    v_mul_hi_i32 v5, v5, s4
 ; CHECK-NEXT:    v_mul_hi_i32 v4, v4, s4
 ; CHECK-NEXT:    v_mul_hi_i32 v3, v3, s5
@@ -84,12 +83,12 @@ define <3 x i1> @test_srem_vec(<3 x i31> %X) nounwind {
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
-; CHECK-NEXT:    v_and_b32_e32 v2, s6, v2
-; CHECK-NEXT:    v_and_b32_e32 v1, s6, v1
-; CHECK-NEXT:    v_and_b32_e32 v0, s6, v0
+; CHECK-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v2
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s6, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 1b486681037ce..c7cadbf49cb11 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1964,7 +1964,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mul_f32_e32 v1, s6, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_mad_f32 v2, -v1, v0, s6
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
@@ -1991,7 +1991,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
-; GCN-IR-NEXT:    v_mul_f32_e32 v1, s6, v1
+; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_mad_f32 v2, -v1, v0, s6
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
@@ -2084,7 +2084,7 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_ashrrev_i32_e32 v3, 30, v0
 ; GCN-NEXT:    v_or_b32_e32 v3, 1, v3
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v1
-; GCN-NEXT:    v_mul_f32_e32 v2, s4, v2
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v4, -v2, v1, s4
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
@@ -2106,7 +2106,7 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v3, 30, v0
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, 1, v3
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v1
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, s4, v2
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v4, -v2, v1, s4
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
@@ -2133,7 +2133,7 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_ashrrev_i32_e32 v3, 30, v0
 ; GCN-NEXT:    v_or_b32_e32 v3, 1, v3
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v1
-; GCN-NEXT:    v_mul_f32_e32 v2, s4, v2
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x47000000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v4, -v2, v1, s4
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
@@ -2155,7 +2155,7 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v3, 30, v0
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, 1, v3
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v1
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, s4, v2
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, 0x47000000, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v4, -v2, v1, s4
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index d9bfe807892c1..69162f068677f 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -134,13 +134,11 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
-; GFX6-NEXT:    s_movk_i32 s5, 0x8000
-; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
-; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
+; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
+; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
+; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
+; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -156,15 +154,14 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v4, v3
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v2
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v4
-; GFX8-NEXT:    s_movk_i32 s6, 0x8000
-; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v1
-; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
@@ -198,21 +195,18 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
-; GFX6-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
-; GFX6-NEXT:    s_movk_i32 s5, 0x8000
-; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
+; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
+; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
-; GFX6-NEXT:    s_mov_b32 s6, 0xffff
-; GFX6-NEXT:    v_min_i32_e32 v2, s4, v2
+; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
+; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
+; GFX6-NEXT:    v_min_i32_e32 v2, 0x7fff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s6, v0
-; GFX6-NEXT:    v_max_i32_e32 v3, s5, v2
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_max_i32_e32 v3, 0xffff8000, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s6, v3
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -225,22 +219,21 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
-; GFX8-NEXT:    s_movk_i32 s6, 0x8000
-; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
+; GFX8-NEXT:    v_xor_b32_e32 v4, 0xffff8000, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v3, v1, v3
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
-; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
-; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
@@ -274,29 +267,26 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
-; GFX6-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
-; GFX6-NEXT:    s_movk_i32 s5, 0x8000
-; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
-; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
-; GFX6-NEXT:    s_mov_b32 s6, 0xffff
+; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
+; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
+; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
+; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
 ; GFX6-NEXT:    v_bfe_i32 v6, v6, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v7, v7, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v3, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
-; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
-; GFX6-NEXT:    v_min_i32_e32 v2, s4, v2
-; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT:    v_max_i32_e32 v2, s5, v2
+; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
+; GFX6-NEXT:    v_min_i32_e32 v2, 0x7fff, v2
+; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
+; GFX6-NEXT:    v_max_i32_e32 v2, 0xffff8000, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s6, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -309,15 +299,14 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
-; GFX8-NEXT:    s_movk_i32 s6, 0x8000
-; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
+; GFX8-NEXT:    v_xor_b32_e32 v4, 0xffff8000, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
-; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -328,14 +317,14 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v2
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v5
-; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v3, v1, v3
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
-; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -369,15 +358,14 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e64 v2, s[4:5], v0, v2
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
-; GFX6-NEXT:    s_brev_b32 s6, 1
-; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v2, s[4:5], v1, v3
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
-; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -389,15 +377,14 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
-; GFX8-NEXT:    s_brev_b32 s6, 1
-; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v1, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
-; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -428,22 +415,21 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v3
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX6-NEXT:    s_brev_b32 s6, 1
-; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v1, v4
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v3
-; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v2, v5
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
-; GFX6-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -455,22 +441,21 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v0, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX8-NEXT:    s_brev_b32 s6, 1
-; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v1, v4
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v3
-; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v2, v5
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v2
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
-; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -503,29 +488,28 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v0, v4
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
-; GFX6-NEXT:    s_brev_b32 s6, 1
-; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v1, v5
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
-; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v2, v6
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v6
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v4
-; GFX6-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v3, v7
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v7
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
-; GFX6-NEXT:    v_xor_b32_e32 v3, s6, v3
+; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -537,29 +521,28 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v0, v4
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
-; GFX8-NEXT:    s_brev_b32 s6, 1
-; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v1, v5
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
-; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v2, v6
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v2
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v4
-; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v7
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v7
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v3
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
-; GFX8-NEXT:    v_xor_b32_e32 v3, s6, v3
+; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -594,57 +577,56 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v0, v8
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v8
-; GFX6-NEXT:    s_brev_b32 s6, 1
-; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v1, v9
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v9
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v2, v10
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v10
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v3, v11
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v3, s6, v3
+; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v4, v12
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v12
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v4, s6, v4
+; GFX6-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v13
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v13
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v5, s6, v5
+; GFX6-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v6, v14
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v14
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v6, s6, v6
+; GFX6-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v7, v15
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v15
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v7
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v7, s6, v7
+; GFX6-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -656,57 +638,56 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v0, v8
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v8
-; GFX8-NEXT:    s_brev_b32 s6, 1
-; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v1, v9
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v9
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v2, v10
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v10
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v2
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v3, v11
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v3
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v3, s6, v3
+; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v4, v12
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v12
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v4
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
+; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v5, v13
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v13
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v5
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v5, s6, v5
+; GFX8-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v6, v14
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v14
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v6
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v6, s6, v6
+; GFX8-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v7, v15
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v15
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v7
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v7, s6, v7
+; GFX8-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -748,37 +729,36 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v16
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v0, v16
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v0
-; GFX6-NEXT:    s_brev_b32 s6, 1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v1, v17
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v17
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v2, v18
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v18
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v3, v19
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v19
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v3, s6, v3
+; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v4, v20
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v4, s6, v4
+; GFX6-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
 ; GFX6-NEXT:    buffer_load_dword v16, off, s[0:3], s32
@@ -786,70 +766,70 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v5, s6, v5
+; GFX6-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v6, v22
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v22
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v6, s6, v6
+; GFX6-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v17, v6, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v7, v23
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v23
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v7
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v7, s6, v7
+; GFX6-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v17, v7, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v8, v24
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v24
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v8
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v8, s6, v8
+; GFX6-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v9, v25
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v25
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v9
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v9, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v9, s6, v9
+; GFX6-NEXT:    v_xor_b32_e32 v9, 0x80000000, v9
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v10, v26
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v26
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v10
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v10, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v10, s6, v10
+; GFX6-NEXT:    v_xor_b32_e32 v10, 0x80000000, v10
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v10, v17, v10, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v11, v27
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v27
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v11
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v11, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v11, s6, v11
+; GFX6-NEXT:    v_xor_b32_e32 v11, 0x80000000, v11
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v11, v17, v11, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v12, v28
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v28
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v12
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v12, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v12, s6, v12
+; GFX6-NEXT:    v_xor_b32_e32 v12, 0x80000000, v12
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v12, v17, v12, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v13, v29
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v29
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v13
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v13, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v13, s6, v13
+; GFX6-NEXT:    v_xor_b32_e32 v13, 0x80000000, v13
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v13, v17, v13, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v14, v30
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v30
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v14
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v14, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v14, s6, v14
+; GFX6-NEXT:    v_xor_b32_e32 v14, 0x80000000, v14
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v14, v17, v14, vcc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -857,7 +837,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v15, v16
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v15
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v15, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v15, s6, v15
+; GFX6-NEXT:    v_xor_b32_e32 v15, 0x80000000, v15
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -868,37 +848,36 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v16
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v0, v16
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v0
-; GFX8-NEXT:    s_brev_b32 s6, 1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v1, v17
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v17
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v2, v18
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v18
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v2
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v3, v19
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v19
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v3
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v3, s6, v3
+; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v4, v20
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
+; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
 ; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
@@ -906,70 +885,70 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v5
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v5, s6, v5
+; GFX8-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v6, v22
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v22
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v6
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v6, s6, v6
+; GFX8-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v17, v6, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v7, v23
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v23
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v7
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v7, s6, v7
+; GFX8-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v17, v7, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v8, v24
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v24
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v8
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v8, s6, v8
+; GFX8-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v9, v25
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v25
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v9
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v9, s6, v9
+; GFX8-NEXT:    v_xor_b32_e32 v9, 0x80000000, v9
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v10, v26
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v26
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v10
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v10, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v10, s6, v10
+; GFX8-NEXT:    v_xor_b32_e32 v10, 0x80000000, v10
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v10, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v11, v27
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v27
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v11
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v11, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v11, s6, v11
+; GFX8-NEXT:    v_xor_b32_e32 v11, 0x80000000, v11
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v11, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v12, v28
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v28
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v12
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v12, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v12, s6, v12
+; GFX8-NEXT:    v_xor_b32_e32 v12, 0x80000000, v12
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v12, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v13, v29
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v29
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v13
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v13, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v13, s6, v13
+; GFX8-NEXT:    v_xor_b32_e32 v13, 0x80000000, v13
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v13, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v14, v30
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v30
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v14
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v14, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v14, s6, v14
+; GFX8-NEXT:    v_xor_b32_e32 v14, 0x80000000, v14
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v14, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -977,7 +956,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v15, v16
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v15
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v15, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v15, s6, v15
+; GFX8-NEXT:    v_xor_b32_e32 v15, 0x80000000, v15
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
index 3831190a538d0..ac56f4e991e96 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
@@ -94,10 +94,9 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
 ; GFX9-NEXT:    v_fma_f16 v1, v1, v3, v5
 ; GFX9-NEXT:    v_fma_f16 v0, v0, v2, v4
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX9-NEXT:    v_fma_f16 v7, v9, v8, v7
-; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 3c74feb7ea441..bc1db5baaf59e 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -578,7 +578,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v3
-; GFX9-NEXT:    v_and_b32_e32 v0, v4, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
 ; GFX9-NEXT:    v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index b4ee382704a78..9a8b1d09ca88f 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -8,11 +8,10 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_min_u32_e32 v0, s4, v0
+; GFX6-NEXT:    v_min_u32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_i8:
@@ -46,11 +45,10 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_min_u32_e32 v0, s4, v0
+; GFX6-NEXT:    v_min_u32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_i16:
@@ -110,15 +108,14 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_min_u32_e32 v1, s4, v1
-; GFX6-NEXT:    v_min_u32_e32 v0, s4, v0
+; GFX6-NEXT:    v_min_u32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_min_u32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
@@ -152,20 +149,19 @@ define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v3i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_and_b32_e32 v5, s4, v5
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GFX6-NEXT:    v_min_u32_e32 v1, s4, v1
+; GFX6-NEXT:    v_min_u32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT:    v_min_u32_e32 v0, s4, v0
+; GFX6-NEXT:    v_min_u32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_min_u32_e32 v3, s4, v2
+; GFX6-NEXT:    v_min_u32_e32 v3, 0xffff, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, 0xffff0000, v3
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
@@ -202,25 +198,24 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v5, s4, v5
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GFX6-NEXT:    v_min_u32_e32 v1, s4, v1
-; GFX6-NEXT:    v_and_b32_e32 v7, s4, v7
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX6-NEXT:    v_and_b32_e32 v6, s4, v6
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT:    v_min_u32_e32 v0, s4, v0
+; GFX6-NEXT:    v_min_u32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_min_u32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v6
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v7
-; GFX6-NEXT:    v_min_u32_e32 v2, s4, v2
-; GFX6-NEXT:    v_min_u32_e32 v1, s4, v1
+; GFX6-NEXT:    v_min_u32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_min_u32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 79c7c8b3b6d81..609146dd48424 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -343,7 +343,6 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    s_mov_b32 s9, s3
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SI-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -353,8 +352,8 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; SI-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; SI-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; SI-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
-; SI-NEXT:    v_mul_f32_e32 v4, s2, v4
-; SI-NEXT:    v_mul_f32_e32 v5, s2, v5
+; SI-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; SI-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
 ; SI-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; SI-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; SI-NEXT:    v_mul_lo_u32 v6, v6, v4
@@ -399,7 +398,6 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; VI-NEXT:    s_mov_b32 s8, s2
 ; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -409,8 +407,8 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; VI-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; VI-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; VI-NEXT:    v_sub_u32_e32 v7, vcc, 0, v3
-; VI-NEXT:    v_mul_f32_e32 v4, s2, v4
-; VI-NEXT:    v_mul_f32_e32 v5, s2, v5
+; VI-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; VI-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
 ; VI-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; VI-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; VI-NEXT:    v_mul_lo_u32 v6, v6, v4
@@ -451,15 +449,14 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GCN-NEXT:    v_mul_f32_e32 v4, s2, v4
+; GCN-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v4
-; GCN-NEXT:    v_mul_f32_e32 v5, s2, v5
+; GCN-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v5
 ; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v2
 ; GCN-NEXT:    v_mul_lo_u32 v5, v4, v6
@@ -612,7 +609,6 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
 ; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
-; SI-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
@@ -624,10 +620,10 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; SI-NEXT:    v_rcp_iflag_f32_e32 v10, v10
 ; SI-NEXT:    v_rcp_iflag_f32_e32 v12, v12
 ; SI-NEXT:    v_rcp_iflag_f32_e32 v14, v14
-; SI-NEXT:    v_mul_f32_e32 v8, s2, v8
-; SI-NEXT:    v_mul_f32_e32 v10, s2, v10
-; SI-NEXT:    v_mul_f32_e32 v12, s2, v12
-; SI-NEXT:    v_mul_f32_e32 v14, s2, v14
+; SI-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; SI-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
+; SI-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
+; SI-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
 ; SI-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; SI-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; SI-NEXT:    v_cvt_u32_f32_e32 v12, v12
@@ -708,7 +704,6 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; VI-NEXT:    s_mov_b32 s5, s3
 ; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
 ; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
-; VI-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; VI-NEXT:    s_mov_b32 s8, s0
 ; VI-NEXT:    s_mov_b32 s9, s1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -720,10 +715,10 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; VI-NEXT:    v_rcp_iflag_f32_e32 v10, v10
 ; VI-NEXT:    v_rcp_iflag_f32_e32 v12, v12
 ; VI-NEXT:    v_rcp_iflag_f32_e32 v14, v14
-; VI-NEXT:    v_mul_f32_e32 v8, s2, v8
-; VI-NEXT:    v_mul_f32_e32 v10, s2, v10
-; VI-NEXT:    v_mul_f32_e32 v12, s2, v12
-; VI-NEXT:    v_mul_f32_e32 v14, s2, v14
+; VI-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; VI-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
+; VI-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
+; VI-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
 ; VI-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; VI-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; VI-NEXT:    v_cvt_u32_f32_e32 v12, v12
@@ -804,7 +799,6 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GCN-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; GCN-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GCN-NEXT:    v_mov_b32_e32 v8, s0
 ; GCN-NEXT:    v_mov_b32_e32 v9, s1
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
@@ -816,10 +810,10 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v12, v12
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v14, v14
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v16, v16
-; GCN-NEXT:    v_mul_f32_e32 v10, s2, v10
-; GCN-NEXT:    v_mul_f32_e32 v12, s2, v12
-; GCN-NEXT:    v_mul_f32_e32 v14, s2, v14
-; GCN-NEXT:    v_mul_f32_e32 v16, s2, v16
+; GCN-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
+; GCN-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
+; GCN-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
+; GCN-NEXT:    v_mul_f32_e32 v16, 0x4f7ffffe, v16
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v12, v12
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v14, v14

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 3ed36b82ced7c..3186ad61d7c48 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -1700,7 +1700,7 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s4, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v1
 ; GCN-NEXT:    v_mad_f32 v1, -v1, v0, s4
@@ -1721,7 +1721,7 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GCN-IR-NEXT:    v_mul_f32_e32 v1, s4, v1
+; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v2, v1
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v1, v0, s4
@@ -1795,7 +1795,7 @@ define i64 @v_test_udiv24_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GCN-NEXT:    s_mov_b32 s4, 0x41c00000
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s4, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v1
 ; GCN-NEXT:    v_mad_f32 v1, -v1, v0, s4
@@ -1812,7 +1812,7 @@ define i64 @v_test_udiv24_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x41c00000
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GCN-IR-NEXT:    v_mul_f32_e32 v1, s4, v1
+; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v2, v1
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v1, v0, s4
@@ -1834,7 +1834,7 @@ define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GCN-NEXT:    s_mov_b32 s4, 0x47000000
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s4, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x47000000, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v1
 ; GCN-NEXT:    v_mad_f32 v1, -v1, v0, s4
@@ -1851,7 +1851,7 @@ define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x47000000
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GCN-IR-NEXT:    v_mul_f32_e32 v1, s4, v1
+; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x47000000, v1
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v2, v1
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v1, v0, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index 5955a4492c224..86ad4598131fd 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -157,19 +157,18 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3
 ; GFX6-LABEL: test_udivrem_v2:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
-; GFX6-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX6-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX6-NEXT:    s_sub_i32 s2, 0, s7
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
@@ -177,22 +176,22 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s7, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -202,18 +201,17 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3
 ; GFX8-LABEL: test_udivrem_v2:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX8-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX8-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX8-NEXT:    s_sub_i32 s2, 0, s7
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s2, v1
@@ -231,11 +229,11 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s7, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s7, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -320,26 +318,25 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX6-LABEL: test_udivrem_v4:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b32 s13, 0x4f7ffffe
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    s_sub_i32 s2, 0, s8
-; GFX6-NEXT:    s_sub_i32 s12, 0, s9
+; GFX6-NEXT:    s_sub_i32 s12, 0, s8
+; GFX6-NEXT:    s_sub_i32 s13, 0, s9
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s11
-; GFX6-NEXT:    v_mul_f32_e32 v0, s13, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s13, v1
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v0
+; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
@@ -347,7 +344,7 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
-; GFX6-NEXT:    v_mul_f32_e32 v2, s13, v3
+; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
@@ -367,7 +364,7 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
 ; GFX6-NEXT:    s_sub_i32 s4, 0, s11
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, s13, v4
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
@@ -399,7 +396,6 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX8-LABEL: test_udivrem_v4:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX8-NEXT:    s_mov_b32 s12, 0x4f7ffffe
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s8
@@ -410,9 +406,9 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, s10
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s11
-; GFX8-NEXT:    v_mul_f32_e32 v0, s12, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, s12, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v0
@@ -425,18 +421,18 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v4, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s8
-; GFX8-NEXT:    v_mul_f32_e32 v2, s12, v3
+; GFX8-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s8, v0
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s8, v0
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s2, v2
-; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s9, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -444,7 +440,7 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v4, v5
 ; GFX8-NEXT:    s_sub_i32 s2, 0, s11
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT:    v_mul_f32_e32 v3, s12, v4
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v4
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s9, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v2, s6, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
index ba3d120be9ba2..a0dd0e7e78f9d 100644
--- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
@@ -5,12 +5,11 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 ; CHECK-LABEL: test_urem_odd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_movk_i32 s4, 0x1fff
-; CHECK-NEXT:    s_movk_i32 s5, 0x667
-; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x1fff, v0
+; CHECK-NEXT:    s_movk_i32 s4, 0x667
 ; CHECK-NEXT:    v_mul_u32_u24_e32 v0, 0xccd, v0
-; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s5, v0
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x1fff, v0
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem i13 %X, 5
@@ -56,10 +55,9 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; CHECK-LABEL: test_urem_negative_odd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_movk_i32 s4, 0x1ff
-; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x1ff, v0
 ; CHECK-NEXT:    v_mul_u32_u24_e32 v0, 0x133, v0
-; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x1ff, v0
 ; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, 1, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -72,27 +70,26 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; CHECK-LABEL: test_urem_vec:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_movk_i32 s4, 0x7ff
-; CHECK-NEXT:    s_mov_b32 s5, 0x8311eb33
-; CHECK-NEXT:    s_mov_b32 s6, 0x20140c
-; CHECK-NEXT:    s_mov_b32 s7, 0xb6db6db7
-; CHECK-NEXT:    s_mov_b32 s8, 0x24924924
-; CHECK-NEXT:    s_mov_b32 s9, 0xaaaaaaab
-; CHECK-NEXT:    s_mov_b32 s10, 0x2aaaaaaa
-; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
-; CHECK-NEXT:    v_and_b32_e32 v1, s4, v1
-; CHECK-NEXT:    v_and_b32_e32 v2, s4, v2
-; CHECK-NEXT:    v_mul_lo_u32 v2, v2, s5
-; CHECK-NEXT:    v_mul_lo_u32 v1, v1, s7
-; CHECK-NEXT:    v_mul_lo_u32 v0, v0, s9
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x7ff, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x7ff, v1
+; CHECK-NEXT:    v_and_b32_e32 v2, 0x7ff, v2
+; CHECK-NEXT:    s_mov_b32 s4, 0x8311eb33
+; CHECK-NEXT:    s_mov_b32 s5, 0x20140c
+; CHECK-NEXT:    s_mov_b32 s6, 0xb6db6db7
+; CHECK-NEXT:    s_mov_b32 s7, 0x24924924
+; CHECK-NEXT:    s_mov_b32 s8, 0xaaaaaaab
+; CHECK-NEXT:    s_mov_b32 s9, 0x2aaaaaaa
+; CHECK-NEXT:    v_mul_lo_u32 v2, v2, s4
+; CHECK-NEXT:    v_mul_lo_u32 v1, v1, s6
+; CHECK-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 0xf9dc299a, v2
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 0x49249249, v1
 ; CHECK-NEXT:    v_alignbit_b32 v0, v0, v0, 1
-; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
+; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s8, v1
+; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s7, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v2
+; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem <3 x i11> %X, <i11 6, i11 7, i11 -5>

diff  --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 35079f42ac78e..1821045bf6963 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -486,36 +486,35 @@ define amdgpu_kernel void @s_test_urem31_v2i64(<2 x i64> addrspace(1)* %out, <2
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s4, s5, 1
-; GCN-NEXT:    s_lshr_b32 s5, s9, 1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; GCN-NEXT:    s_lshr_b32 s6, s11, 1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s6
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GCN-NEXT:    s_lshr_b32 s7, s7, 1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s7
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v3
-; GCN-NEXT:    v_mul_f32_e32 v4, v2, v4
-; GCN-NEXT:    v_trunc_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; GCN-NEXT:    v_mad_f32 v2, -v4, v0, v2
+; GCN-NEXT:    s_lshr_b32 s4, s9, 1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT:    s_lshr_b32 s5, s5, 1
+; GCN-NEXT:    s_lshr_b32 s6, s7, 1
+; GCN-NEXT:    s_lshr_b32 s7, s11, 1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s5
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s7
+; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s6
+; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
+; GCN-NEXT:    v_trunc_f32_e32 v3, v3
+; GCN-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
 ; GCN-NEXT:    v_mul_f32_e32 v2, v5, v6
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v7, vcc
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v2
-; GCN-NEXT:    v_mad_f32 v2, -v2, v3, v5
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, s6
-; GCN-NEXT:    s_brev_b32 s4, -2
-; GCN-NEXT:    v_and_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
+; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GCN-NEXT:    v_mad_f32 v2, -v2, v4, v5
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v2, s7
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
+; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v2
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s7, v2
-; GCN-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -527,36 +526,35 @@ define amdgpu_kernel void @s_test_urem31_v2i64(<2 x i64> addrspace(1)* %out, <2
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_lshr_b32 s4, s5, 1
-; GCN-IR-NEXT:    s_lshr_b32 s5, s9, 1
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; GCN-IR-NEXT:    s_lshr_b32 s6, s11, 1
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v3, s6
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GCN-IR-NEXT:    s_lshr_b32 s7, s7, 1
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v5, s7
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v6, v3
-; GCN-IR-NEXT:    v_mul_f32_e32 v4, v2, v4
-; GCN-IR-NEXT:    v_trunc_f32_e32 v4, v4
-; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; GCN-IR-NEXT:    v_mad_f32 v2, -v4, v0, v2
+; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 1
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-IR-NEXT:    s_lshr_b32 s5, s5, 1
+; GCN-IR-NEXT:    s_lshr_b32 s6, s7, 1
+; GCN-IR-NEXT:    s_lshr_b32 s7, s11, 1
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v2, s5
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v4, s7
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v5, s6
+; GCN-IR-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v6, v4
+; GCN-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GCN-IR-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v5, v6
-; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v4, v2
-; GCN-IR-NEXT:    v_mad_f32 v2, -v2, v3, v5
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
-; GCN-IR-NEXT:    v_mul_lo_u32 v2, v2, s6
-; GCN-IR-NEXT:    s_brev_b32 s4, -2
-; GCN-IR-NEXT:    v_and_b32_e32 v0, s4, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
+; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GCN-IR-NEXT:    v_mad_f32 v2, -v2, v4, v5
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GCN-IR-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GCN-IR-NEXT:    v_mul_lo_u32 v2, v2, s7
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
+; GCN-IR-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s7, v2
-; GCN-IR-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = lshr <2 x i64> %x, <i64 33, i64 33>
@@ -632,36 +630,35 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(<2 x i64> addrspace(1)* %out,
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s4, s5, 1
-; GCN-NEXT:    s_lshr_b32 s5, s9, 1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; GCN-NEXT:    s_lshr_b32 s6, s11, 9
-; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s6
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GCN-NEXT:    s_lshr_b32 s7, s7, 9
-; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s7
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v3
-; GCN-NEXT:    v_mul_f32_e32 v4, v2, v4
-; GCN-NEXT:    v_trunc_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; GCN-NEXT:    v_mad_f32 v2, -v4, v0, v2
+; GCN-NEXT:    s_lshr_b32 s4, s9, 1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT:    s_lshr_b32 s5, s5, 1
+; GCN-NEXT:    s_lshr_b32 s6, s7, 9
+; GCN-NEXT:    s_lshr_b32 s7, s11, 9
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s5
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s7
+; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s6
+; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
+; GCN-NEXT:    v_trunc_f32_e32 v3, v3
+; GCN-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
 ; GCN-NEXT:    v_mul_f32_e32 v2, v5, v6
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v7, vcc
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v2
-; GCN-NEXT:    v_mad_f32 v2, -v2, v3, v5
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, s6
-; GCN-NEXT:    s_brev_b32 s4, -2
-; GCN-NEXT:    v_and_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
+; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GCN-NEXT:    v_mad_f32 v2, -v2, v4, v5
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v2, s7
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
+; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v2
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s7, v2
-; GCN-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -673,36 +670,35 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(<2 x i64> addrspace(1)* %out,
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_lshr_b32 s4, s5, 1
-; GCN-IR-NEXT:    s_lshr_b32 s5, s9, 1
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; GCN-IR-NEXT:    s_lshr_b32 s6, s11, 9
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v3, s6
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GCN-IR-NEXT:    s_lshr_b32 s7, s7, 9
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v5, s7
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v6, v3
-; GCN-IR-NEXT:    v_mul_f32_e32 v4, v2, v4
-; GCN-IR-NEXT:    v_trunc_f32_e32 v4, v4
-; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; GCN-IR-NEXT:    v_mad_f32 v2, -v4, v0, v2
+; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 1
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-IR-NEXT:    s_lshr_b32 s5, s5, 1
+; GCN-IR-NEXT:    s_lshr_b32 s6, s7, 9
+; GCN-IR-NEXT:    s_lshr_b32 s7, s11, 9
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v2, s5
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v4, s7
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v5, s6
+; GCN-IR-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v6, v4
+; GCN-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GCN-IR-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v5, v6
-; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v4, v2
-; GCN-IR-NEXT:    v_mad_f32 v2, -v2, v3, v5
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
-; GCN-IR-NEXT:    v_mul_lo_u32 v2, v2, s6
-; GCN-IR-NEXT:    s_brev_b32 s4, -2
-; GCN-IR-NEXT:    v_and_b32_e32 v0, s4, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
+; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GCN-IR-NEXT:    v_mad_f32 v2, -v2, v4, v5
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GCN-IR-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GCN-IR-NEXT:    v_mul_lo_u32 v2, v2, s7
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
+; GCN-IR-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s7, v2
-; GCN-IR-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = lshr <2 x i64> %x, <i64 33, i64 41>
@@ -1375,7 +1371,7 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s5, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v1
 ; GCN-NEXT:    v_mad_f32 v1, -v1, v0, s5
@@ -1398,7 +1394,7 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GCN-IR-NEXT:    v_mul_f32_e32 v1, s5, v1
+; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v2, v1
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v1, v0, s5
@@ -1480,7 +1476,7 @@ define i64 @v_test_urem24_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s4, 0x41c00000
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v1
-; GCN-NEXT:    v_mul_f32_e32 v2, s4, v2
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
 ; GCN-NEXT:    v_mad_f32 v2, -v2, v1, s4
@@ -1499,7 +1495,7 @@ define i64 @v_test_urem24_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x41c00000
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v1
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, s4, v2
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
 ; GCN-IR-NEXT:    v_mad_f32 v2, -v2, v1, s4
@@ -1523,7 +1519,7 @@ define i64 @v_test_urem24_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s4, 0x47000000
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v1
-; GCN-NEXT:    v_mul_f32_e32 v2, s4, v2
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x47000000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
 ; GCN-NEXT:    v_mad_f32 v2, -v2, v1, s4
@@ -1542,7 +1538,7 @@ define i64 @v_test_urem24_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x47000000
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v1
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, s4, v2
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, 0x47000000, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
 ; GCN-IR-NEXT:    v_mad_f32 v2, -v2, v1, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index 17b357de1e783..ba2e4ab191194 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -8,9 +8,8 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_usubsat_i8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -43,9 +42,8 @@ define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_usubsat_i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -214,11 +212,10 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_usubsat_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v3
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_max_u32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
@@ -256,16 +253,15 @@ define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-LABEL: v_usubsat_v3i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v6, s4, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v4
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_max_u32_e32 v1, v1, v6
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, s4, v5
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -305,18 +301,17 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-LABEL: v_usubsat_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v9, s4, v5
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_and_b32_e32 v9, 0xffff, v5
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_max_u32_e32 v1, v1, v9
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v7
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX6-NEXT:    v_and_b32_e32 v6, s4, v6
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v7
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index ee607351d5ba1..c627c78885fa0 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -953,13 +953,12 @@ define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, v2, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, v2, v3
-; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_1100:
@@ -1366,7 +1365,7 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
-; GFX9-NEXT:    v_and_b32_e32 v1, v0, v4
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v1


        


More information about the llvm-commits mailing list