[llvm] 3eb2281 - [AMDGPU] Aggressively fold immediates in SIFoldOperands

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Wed May 18 02:21:59 PDT 2022


Author: Jay Foad
Date: 2022-05-18T10:19:35+01:00
New Revision: 3eb2281bc067688dc701cf94e267395680892cf0

URL: https://github.com/llvm/llvm-project/commit/3eb2281bc067688dc701cf94e267395680892cf0
DIFF: https://github.com/llvm/llvm-project/commit/3eb2281bc067688dc701cf94e267395680892cf0.diff

LOG: [AMDGPU] Aggressively fold immediates in SIFoldOperands

Previously SIFoldOperands::foldInstOperand would only fold a
non-inlinable immediate into a single user, so as not to increase code
size by adding the same 32-bit literal operand to many instructions.

This patch removes that restriction, so that a non-inlinable immediate
will be folded into any number of users. The rationale is:
- It reduces the number of registers used for holding constant values,
  which might increase occupancy. (On the other hand, many of these
  registers are SGPRs which no longer affect occupancy on GFX10+.)
- It reduces ALU stalls between the instruction that loads a constant
  into a register, and the instruction that uses it.
- The above benefits are expected to outweigh any increase in code size.

Differential Revision: https://reviews.llvm.org/D114643

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
    llvm/test/CodeGen/AMDGPU/add.v2i16.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/and.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
    llvm/test/CodeGen/AMDGPU/bypass-div.ll
    llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
    llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
    llvm/test/CodeGen/AMDGPU/ctlz.ll
    llvm/test/CodeGen/AMDGPU/cttz.ll
    llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
    llvm/test/CodeGen/AMDGPU/fabs.f16.ll
    llvm/test/CodeGen/AMDGPU/fabs.f64.ll
    llvm/test/CodeGen/AMDGPU/fabs.ll
    llvm/test/CodeGen/AMDGPU/fexp.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/fmed3.ll
    llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
    llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
    llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
    llvm/test/CodeGen/AMDGPU/fneg.ll
    llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir
    llvm/test/CodeGen/AMDGPU/frem.ll
    llvm/test/CodeGen/AMDGPU/fshr.ll
    llvm/test/CodeGen/AMDGPU/idiv-licm.ll
    llvm/test/CodeGen/AMDGPU/idot2.ll
    llvm/test/CodeGen/AMDGPU/idot4u.ll
    llvm/test/CodeGen/AMDGPU/idot8s.ll
    llvm/test/CodeGen/AMDGPU/idot8u.ll
    llvm/test/CodeGen/AMDGPU/immv216.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
    llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
    llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
    llvm/test/CodeGen/AMDGPU/load-global-i16.ll
    llvm/test/CodeGen/AMDGPU/madak.ll
    llvm/test/CodeGen/AMDGPU/max.ll
    llvm/test/CodeGen/AMDGPU/mul.ll
    llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
    llvm/test/CodeGen/AMDGPU/or.ll
    llvm/test/CodeGen/AMDGPU/packed-fp32.ll
    llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
    llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
    llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
    llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
    llvm/test/CodeGen/AMDGPU/setcc-opt.ll
    llvm/test/CodeGen/AMDGPU/shift-i128.ll
    llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
    llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
    llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
    llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
    llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
    llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
    llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
    llvm/test/CodeGen/AMDGPU/uaddsat.ll
    llvm/test/CodeGen/AMDGPU/udiv.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/udivrem24.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll
    llvm/test/CodeGen/AMDGPU/usubsat.ll
    llvm/test/CodeGen/AMDGPU/v_pack.ll
    llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
    llvm/test/CodeGen/AMDGPU/xor.ll
    llvm/test/CodeGen/AMDGPU/zero_extend.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 367951458ea45..99aa8a60b04f5 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -146,30 +146,6 @@ static unsigned macToMad(unsigned Opc) {
   return AMDGPU::INSTRUCTION_LIST_END;
 }
 
-// Wrapper around isInlineConstant that understands special cases when
-// instruction types are replaced during operand folding.
-static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
-                                     const MachineInstr &UseMI,
-                                     unsigned OpNo,
-                                     const MachineOperand &OpToFold) {
-  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
-    return true;
-
-  unsigned Opc = UseMI.getOpcode();
-  unsigned NewOpc = macToMad(Opc);
-  if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
-    // Special case for mac. Since this is replaced with mad when folded into
-    // src2, we need to check the legality for the final instruction.
-    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
-    if (static_cast<int>(OpNo) == Src2Idx) {
-      const MCInstrDesc &MadDesc = TII->get(NewOpc);
-      return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
-    }
-  }
-
-  return false;
-}
-
 // TODO: Add heuristic that the frame index might not fit in the addressing mode
 // immediate offset to avoid materializing in loops.
 static bool frameIndexMayFold(const SIInstrInfo *TII,
@@ -1267,59 +1243,13 @@ bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
     }
   }
 
-  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
-  if (FoldingImm) {
-    unsigned NumLiteralUses = 0;
-    MachineOperand *NonInlineUse = nullptr;
-    int NonInlineUseOpNo = -1;
-
-    for (auto &Use :
-         make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) {
-      MachineInstr *UseMI = Use.getParent();
-      unsigned OpNo = UseMI->getOperandNo(&Use);
-
-      // Try to fold any inline immediate uses, and then only fold other
-      // constants if they have one use.
-      //
-      // The legality of the inline immediate must be checked based on the use
-      // operand, not the defining instruction, because 32-bit instructions
-      // with 32-bit inline immediate sources may be used to materialize
-      // constants used in 16-bit operands.
-      //
-      // e.g. it is unsafe to fold:
-      //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
-      //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
-
-      // Folding immediates with more than one use will increase program size.
-      // FIXME: This will also reduce register usage, which may be better
-      // in some cases. A better heuristic is needed.
-      if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
-        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
-      } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
-        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
-      } else {
-        if (++NumLiteralUses == 1) {
-          NonInlineUse = &Use;
-          NonInlineUseOpNo = OpNo;
-        }
-      }
-    }
-
-    if (NumLiteralUses == 1) {
-      MachineInstr *UseMI = NonInlineUse->getParent();
-      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
-    }
-  } else {
-    // Folding register.
-    SmallVector <MachineOperand *, 4> UsesToProcess;
-    for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
-      UsesToProcess.push_back(&Use);
-    for (auto U : UsesToProcess) {
-      MachineInstr *UseMI = U->getParent();
-
-      foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U),
-        FoldList, CopiesToReplace);
-    }
+  SmallVector<MachineOperand *, 4> UsesToProcess;
+  for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
+    UsesToProcess.push_back(&Use);
+  for (auto U : UsesToProcess) {
+    MachineInstr *UseMI = U->getParent();
+    foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
+                CopiesToReplace);
   }
 
   if (CopiesToReplace.empty() && FoldList.empty())

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index 1578076411dc0..aa29bf5ee65ff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -127,9 +127,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
 ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0xffc0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_add_u16_e32 v1, s4, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0xffffffc0
+; GFX8-NEXT:    v_add_u16_e32 v1, 0xffc0, v0
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -209,14 +208,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
 ;
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s3, 0xffff
-; GFX8-NEXT:    s_mov_b32 s1, 0xffc0
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s3
-; GFX8-NEXT:    s_add_i32 s0, s0, s1
-; GFX8-NEXT:    s_add_i32 s2, s2, s1
-; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s3
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_add_i32 s0, s0, 0xffc0
+; GFX8-NEXT:    s_add_i32 s1, s1, 0xffc0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -243,13 +240,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
 ;
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_add_i32 s0, s0, 0xffc0
 ; GFX8-NEXT:    s_add_i32 s1, s1, 4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -276,13 +272,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
 ;
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_add_i32 s0, s0, 4
 ; GFX8-NEXT:    s_add_i32 s1, s1, 0xffc0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -310,15 +305,14 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
 ;
 ; GFX8-LABEL: s_add_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s3
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, s3
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
-; GFX8-NEXT:    s_add_i32 s2, s2, s4
+; GFX8-NEXT:    s_add_i32 s2, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s3
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -349,15 +343,14 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg
 ; GFX8-LABEL: s_add_v2i16_fneg_lhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
-; GFX8-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s3
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, s3
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
-; GFX8-NEXT:    s_add_i32 s2, s2, s4
+; GFX8-NEXT:    s_add_i32 s2, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s3
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -391,15 +384,14 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg
 ; GFX8-LABEL: s_add_v2i16_fneg_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
-; GFX8-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s3
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, s3
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
-; GFX8-NEXT:    s_add_i32 s2, s2, s4
+; GFX8-NEXT:    s_add_i32 s2, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s3
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -422,9 +414,8 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg
 define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) {
 ; GFX9-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s2, 0x80008000
-; GFX9-NEXT:    s_xor_b32 s0, s0, s2
-; GFX9-NEXT:    s_xor_b32 s1, s1, s2
+; GFX9-NEXT:    s_xor_b32 s0, s0, 0x80008000
+; GFX9-NEXT:    s_xor_b32 s1, s1, 0x80008000
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX9-NEXT:    s_add_i32 s0, s0, s1
@@ -434,26 +425,23 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha
 ;
 ; GFX8-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s2, 0x80008000
-; GFX8-NEXT:    s_xor_b32 s0, s0, s2
-; GFX8-NEXT:    s_xor_b32 s1, s1, s2
-; GFX8-NEXT:    s_mov_b32 s3, 0xffff
+; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
+; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s3
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, s3
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
-; GFX8-NEXT:    s_add_i32 s2, s2, s4
+; GFX8-NEXT:    s_add_i32 s2, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s3
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s2, 0x80008000
-; GFX10-NEXT:    s_xor_b32 s0, s0, s2
-; GFX10-NEXT:    s_xor_b32 s1, s1, s2
+; GFX10-NEXT:    s_xor_b32 s0, s0, 0x80008000
+; GFX10-NEXT:    s_xor_b32 s1, s1, 0x80008000
 ; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX10-NEXT:    s_add_i32 s0, s0, s1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index 646705337aabc..2280dc45e7de2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -967,7 +967,7 @@ define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) {
 ; GFX7-LABEL: uaddo_i16_sv:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_mov_b32 s1, 0xffff
-; GFX7-NEXT:    s_and_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    v_and_b32_e32 v0, s1, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, s1, v0
@@ -980,7 +980,7 @@ define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) {
 ; GFX8-LABEL: uaddo_i16_sv:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_mov_b32 s1, 0xffff
-; GFX8-NEXT:    s_and_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    v_and_b32_e32 v0, s1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, s1, v0
@@ -992,8 +992,7 @@ define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) {
 ;
 ; GFX9-LABEL: uaddo_i16_sv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s1, 0xffff
-; GFX9-NEXT:    s_and_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX9-NEXT:    v_add_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    v_cmp_ne_u32_sdwa s[0:1], v0, v0 src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index 2e117b2e0f37c..99d3bf4f2c4ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -429,13 +429,12 @@ define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
 define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
 ; GFX6-LABEL: s_andn2_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_and_b32 s1, s4, s1
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_xor_b32 s1, s1, -1
 ; GFX6-NEXT:    s_and_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -458,13 +457,12 @@ define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1
 define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
 ; GFX6-LABEL: s_andn2_v2i16_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_and_b32 s1, s4, s1
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_xor_b32 s1, s1, -1
 ; GFX6-NEXT:    s_and_b32 s0, s1, s0
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -487,13 +485,12 @@ define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inr
 define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
 ; GFX6-LABEL: s_andn2_v2i16_multi_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_and_b32 s1, s4, s1
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_xor_b32 s1, s1, -1
 ; GFX6-NEXT:    s_and_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -522,19 +519,18 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2
 define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
 ; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_and_b32 s3, s4, s1
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX6-NEXT:    s_or_b32 s1, s1, s2
+; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
+; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
-; GFX6-NEXT:    s_and_b32 s1, s6, s1
-; GFX6-NEXT:    s_or_b32 s1, s3, s1
-; GFX6-NEXT:    s_xor_b32 s1, s1, -1
-; GFX6-NEXT:    s_and_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s1
+; GFX6-NEXT:    s_xor_b32 s2, s2, -1
+; GFX6-NEXT:    s_and_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use:
@@ -630,18 +626,17 @@ define amdgpu_ps i64 @s_andn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1
 ; GFX6-LABEL: s_andn2_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_mov_b32 s3, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s2, s3
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
-; GFX6-NEXT:    s_and_b32 s2, s4, s3
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
-; GFX6-NEXT:    s_and_b32 s4, s6, s3
-; GFX6-NEXT:    s_or_b32 s2, s2, s4
-; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
-; GFX6-NEXT:    s_and_b32 s3, s8, s3
-; GFX6-NEXT:    s_or_b32 s3, s4, s3
+; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
+; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
+; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, s4
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
@@ -673,18 +668,17 @@ define amdgpu_ps i64 @s_andn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inr
 ; GFX6-LABEL: s_andn2_v4i16_commute:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_mov_b32 s3, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s2, s3
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
-; GFX6-NEXT:    s_and_b32 s2, s4, s3
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
-; GFX6-NEXT:    s_and_b32 s4, s6, s3
-; GFX6-NEXT:    s_or_b32 s2, s2, s4
-; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
-; GFX6-NEXT:    s_and_b32 s3, s8, s3
-; GFX6-NEXT:    s_or_b32 s3, s4, s3
+; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
+; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
+; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, s4
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
@@ -716,18 +710,17 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
 ; GFX6-LABEL: s_andn2_v4i16_multi_use:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_mov_b32 s3, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s2, s3
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
-; GFX6-NEXT:    s_and_b32 s2, s4, s3
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
-; GFX6-NEXT:    s_and_b32 s4, s6, s3
-; GFX6-NEXT:    s_or_b32 s2, s2, s4
-; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
-; GFX6-NEXT:    s_and_b32 s3, s8, s3
-; GFX6-NEXT:    s_or_b32 s3, s4, s3
+; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
+; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
+; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, s4
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
@@ -766,24 +759,23 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
 define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) {
 ; GFX6-LABEL: s_andn2_v4i16_multi_foldable_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s14, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_and_b32 s1, s2, s14
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
-; GFX6-NEXT:    s_and_b32 s2, s4, s14
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
-; GFX6-NEXT:    s_and_b32 s3, s6, s14
+; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
-; GFX6-NEXT:    s_and_b32 s4, s8, s14
+; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s11, 16
-; GFX6-NEXT:    s_and_b32 s5, s10, s14
+; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
 ; GFX6-NEXT:    s_or_b32 s4, s4, s5
 ; GFX6-NEXT:    s_lshl_b32 s5, s13, 16
-; GFX6-NEXT:    s_and_b32 s6, s12, s14
+; GFX6-NEXT:    s_and_b32 s6, s12, 0xffff
 ; GFX6-NEXT:    s_or_b32 s5, s5, s6
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_mov_b32 s7, s6

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 7d393cd26a44e..a50950782f3c7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -794,24 +794,22 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ; GFX6-LABEL: s_ashr_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sext_i32_i16 s1, s1
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, s3
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, s2
-; GFX6-NEXT:    s_and_b32 s1, s1, s4
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_ashr_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s3, 0x100010
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s0
-; GFX8-NEXT:    s_bfe_i32 s0, s0, s3
-; GFX8-NEXT:    s_sext_i32_i16 s4, s1
-; GFX8-NEXT:    s_bfe_i32 s1, s1, s3
-; GFX8-NEXT:    s_ashr_i32 s2, s2, s4
+; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x100010
+; GFX8-NEXT:    s_sext_i32_i16 s3, s1
+; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x100010
+; GFX8-NEXT:    s_ashr_i32 s2, s2, s3
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
@@ -886,12 +884,12 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
 define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
 ; GFX6-LABEL: ashr_v2i16_vs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, s0, v0
-; GFX6-NEXT:    s_and_b32 s0, s1, s2
+; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
@@ -994,45 +992,42 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ; GFX6-LABEL: s_ashr_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sext_i32_i16 s1, s1
-; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, s5
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, s4
 ; GFX6-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX6-NEXT:    s_sext_i32_i16 s3, s3
-; GFX6-NEXT:    s_and_b32 s1, s1, s8
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, s6
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, s7
-; GFX6-NEXT:    s_and_b32 s0, s0, s8
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s8
-; GFX6-NEXT:    s_and_b32 s2, s3, s8
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_ashr_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s5, 0x100010
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s0
-; GFX8-NEXT:    s_bfe_i32 s0, s0, s5
-; GFX8-NEXT:    s_sext_i32_i16 s6, s1
-; GFX8-NEXT:    s_bfe_i32 s1, s1, s5
-; GFX8-NEXT:    s_sext_i32_i16 s7, s2
-; GFX8-NEXT:    s_bfe_i32 s2, s2, s5
-; GFX8-NEXT:    s_sext_i32_i16 s8, s3
-; GFX8-NEXT:    s_bfe_i32 s3, s3, s5
-; GFX8-NEXT:    s_ashr_i32 s4, s4, s7
+; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x100010
+; GFX8-NEXT:    s_sext_i32_i16 s5, s1
+; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x100010
+; GFX8-NEXT:    s_sext_i32_i16 s6, s2
+; GFX8-NEXT:    s_bfe_i32 s2, s2, 0x100010
+; GFX8-NEXT:    s_sext_i32_i16 s7, s3
+; GFX8-NEXT:    s_bfe_i32 s3, s3, 0x100010
+; GFX8-NEXT:    s_ashr_i32 s4, s4, s6
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s2
-; GFX8-NEXT:    s_ashr_i32 s2, s6, s8
+; GFX8-NEXT:    s_ashr_i32 s2, s5, s7
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s3
-; GFX8-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX8-NEXT:    s_and_b32 s4, s4, s3
+; GFX8-NEXT:    s_and_b32 s3, s4, 0xffff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, s3
-; GFX8-NEXT:    s_or_b32 s0, s0, s4
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX8-NEXT:    s_or_b32 s0, s0, s3
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1191,79 +1186,76 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX6-LABEL: s_ashr_v8i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sext_i32_i16 s1, s1
-; GFX6-NEXT:    s_mov_b32 s16, 0xffff
 ; GFX6-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, s9
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, s8
 ; GFX6-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX6-NEXT:    s_sext_i32_i16 s3, s3
-; GFX6-NEXT:    s_and_b32 s1, s1, s16
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, s10
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, s11
 ; GFX6-NEXT:    s_sext_i32_i16 s5, s5
-; GFX6-NEXT:    s_and_b32 s0, s0, s16
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, s13
 ; GFX6-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s16
-; GFX6-NEXT:    s_and_b32 s2, s3, s16
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, s12
 ; GFX6-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX6-NEXT:    s_ashr_i32 s7, s7, s15
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, s16
+; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s6, s6, s14
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s2, s4, s16
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s4, s7, s16
+; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_and_b32 s3, s6, s16
+; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_ashr_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s9, 0x100010
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s0
-; GFX8-NEXT:    s_bfe_i32 s0, s0, s9
-; GFX8-NEXT:    s_sext_i32_i16 s10, s1
-; GFX8-NEXT:    s_bfe_i32 s1, s1, s9
-; GFX8-NEXT:    s_sext_i32_i16 s12, s3
-; GFX8-NEXT:    s_bfe_i32 s3, s3, s9
-; GFX8-NEXT:    s_sext_i32_i16 s13, s4
-; GFX8-NEXT:    s_bfe_i32 s4, s4, s9
-; GFX8-NEXT:    s_sext_i32_i16 s14, s5
-; GFX8-NEXT:    s_bfe_i32 s5, s5, s9
-; GFX8-NEXT:    s_sext_i32_i16 s16, s7
-; GFX8-NEXT:    s_bfe_i32 s7, s7, s9
-; GFX8-NEXT:    s_sext_i32_i16 s11, s2
-; GFX8-NEXT:    s_bfe_i32 s2, s2, s9
-; GFX8-NEXT:    s_sext_i32_i16 s15, s6
-; GFX8-NEXT:    s_bfe_i32 s6, s6, s9
+; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x100010
+; GFX8-NEXT:    s_sext_i32_i16 s9, s1
+; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x100010
+; GFX8-NEXT:    s_sext_i32_i16 s12, s4
+; GFX8-NEXT:    s_bfe_i32 s4, s4, 0x100010
+; GFX8-NEXT:    s_sext_i32_i16 s13, s5
+; GFX8-NEXT:    s_bfe_i32 s5, s5, 0x100010
+; GFX8-NEXT:    s_sext_i32_i16 s10, s2
+; GFX8-NEXT:    s_bfe_i32 s2, s2, 0x100010
+; GFX8-NEXT:    s_sext_i32_i16 s14, s6
+; GFX8-NEXT:    s_bfe_i32 s6, s6, 0x100010
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s4
-; GFX8-NEXT:    s_ashr_i32 s4, s10, s14
+; GFX8-NEXT:    s_ashr_i32 s4, s9, s13
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s5
-; GFX8-NEXT:    s_ashr_i32 s3, s3, s7
-; GFX8-NEXT:    s_mov_b32 s7, 0xffff
-; GFX8-NEXT:    s_ashr_i32 s5, s11, s15
+; GFX8-NEXT:    s_sext_i32_i16 s11, s3
+; GFX8-NEXT:    s_bfe_i32 s3, s3, 0x100010
+; GFX8-NEXT:    s_sext_i32_i16 s15, s7
+; GFX8-NEXT:    s_bfe_i32 s7, s7, 0x100010
+; GFX8-NEXT:    s_ashr_i32 s5, s10, s14
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, s6
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s4, s4, s7
-; GFX8-NEXT:    s_ashr_i32 s8, s8, s13
-; GFX8-NEXT:    s_ashr_i32 s6, s12, s16
+; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
+; GFX8-NEXT:    s_ashr_i32 s8, s8, s12
+; GFX8-NEXT:    s_ashr_i32 s6, s11, s15
+; GFX8-NEXT:    s_ashr_i32 s3, s3, s7
 ; GFX8-NEXT:    s_or_b32 s1, s1, s4
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    s_and_b32 s4, s5, s7
+; GFX8-NEXT:    s_and_b32 s4, s5, 0xffff
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX8-NEXT:    s_and_b32 s8, s8, s7
+; GFX8-NEXT:    s_and_b32 s7, s8, 0xffff
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX8-NEXT:    s_and_b32 s4, s6, s7
-; GFX8-NEXT:    s_or_b32 s0, s0, s8
+; GFX8-NEXT:    s_and_b32 s4, s6, 0xffff
+; GFX8-NEXT:    s_or_b32 s0, s0, s7
 ; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
index 1a18d7f138cd6..f017ca8ce2f89 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -111,9 +111,8 @@ define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) {
 ;
 ; GFX10-LABEL: s_bswap_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s2, 0x10203
-; GFX10-NEXT:    v_perm_b32 v0, 0, s0, s2
-; GFX10-NEXT:    v_perm_b32 v1, 0, s1, s2
+; GFX10-NEXT:    v_perm_b32 v0, 0, s0, 0x10203
+; GFX10-NEXT:    v_perm_b32 v1, 0, s1, 0x10203
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -154,9 +153,8 @@ define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_mov_b32 s4, 0x10203
-; GFX10-NEXT:    v_perm_b32 v0, 0, v0, s4
-; GFX10-NEXT:    v_perm_b32 v1, 0, v1, s4
+; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0x10203
+; GFX10-NEXT:    v_perm_b32 v1, 0, v1, 0x10203
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src)
   ret <2 x i32> %bswap
@@ -200,9 +198,8 @@ define amdgpu_ps i64 @s_bswap_i64(i64 inreg %src) {
 ;
 ; GFX10-LABEL: s_bswap_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s2, 0x10203
-; GFX10-NEXT:    v_perm_b32 v0, 0, s1, s2
-; GFX10-NEXT:    v_perm_b32 v1, 0, s0, s2
+; GFX10-NEXT:    v_perm_b32 v0, 0, s1, 0x10203
+; GFX10-NEXT:    v_perm_b32 v1, 0, s0, 0x10203
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -246,9 +243,8 @@ define i64 @v_bswap_i64(i64 %src) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_mov_b32 s4, 0x10203
-; GFX10-NEXT:    v_perm_b32 v2, 0, v1, s4
-; GFX10-NEXT:    v_perm_b32 v1, 0, v0, s4
+; GFX10-NEXT:    v_perm_b32 v2, 0, v1, 0x10203
+; GFX10-NEXT:    v_perm_b32 v1, 0, v0, 0x10203
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call i64 @llvm.bswap.i64(i64 %src)
@@ -313,11 +309,10 @@ define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) {
 ;
 ; GFX10-LABEL: s_bswap_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s4, 0x10203
-; GFX10-NEXT:    v_perm_b32 v0, 0, s1, s4
-; GFX10-NEXT:    v_perm_b32 v1, 0, s0, s4
-; GFX10-NEXT:    v_perm_b32 v2, 0, s3, s4
-; GFX10-NEXT:    v_perm_b32 v3, 0, s2, s4
+; GFX10-NEXT:    v_perm_b32 v0, 0, s1, 0x10203
+; GFX10-NEXT:    v_perm_b32 v1, 0, s0, 0x10203
+; GFX10-NEXT:    v_perm_b32 v2, 0, s3, 0x10203
+; GFX10-NEXT:    v_perm_b32 v3, 0, s2, 0x10203
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
@@ -376,11 +371,10 @@ define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_mov_b32 s4, 0x10203
-; GFX10-NEXT:    v_perm_b32 v4, 0, v1, s4
-; GFX10-NEXT:    v_perm_b32 v5, 0, v3, s4
-; GFX10-NEXT:    v_perm_b32 v1, 0, v0, s4
-; GFX10-NEXT:    v_perm_b32 v3, 0, v2, s4
+; GFX10-NEXT:    v_perm_b32 v4, 0, v1, 0x10203
+; GFX10-NEXT:    v_perm_b32 v5, 0, v3, 0x10203
+; GFX10-NEXT:    v_perm_b32 v1, 0, v0, 0x10203
+; GFX10-NEXT:    v_perm_b32 v3, 0, v2, 0x10203
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -457,12 +451,11 @@ define i16 @v_bswap_i16(i16 %src) {
 define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
 ; GFX7-LABEL: s_bswap_v2i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s3, 0x80008
 ; GFX7-NEXT:    s_lshl_b32 s2, s0, 8
-; GFX7-NEXT:    s_bfe_u32 s0, s0, s3
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80008
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
 ; GFX7-NEXT:    s_lshl_b32 s2, s1, 8
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s3
+; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80008
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
 ; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x100000
@@ -647,9 +640,8 @@ define i64 @v_bswap_i48(i64 %src) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_mov_b32 s4, 0x10203
-; GFX10-NEXT:    v_perm_b32 v1, 0, v1, s4
-; GFX10-NEXT:    v_perm_b32 v2, 0, v0, s4
+; GFX10-NEXT:    v_perm_b32 v1, 0, v1, 0x10203
+; GFX10-NEXT:    v_perm_b32 v2, 0, v0, 0x10203
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 16, v[1:2]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %trunc = trunc i64 %src to i48

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
index 785efb6c4375d..f4782f54b891a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -901,30 +901,29 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, s4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-NEXT:    v_and_or_b32 v3, v3, v8, s4
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v8, v6
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v8, v7
+; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v3, s4
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v6
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v2, v7
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v8, s4
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_and_or_b32 v2, v4, v8, v2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v8, v6
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v4, v2
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v6
 ; GFX10-NEXT:    v_pk_add_f16 v0, v2, v0
-; GFX10-NEXT:    v_and_or_b32 v2, v5, v8, s4
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v5, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX10-NEXT:    v_pk_add_f16 v1, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v8, v3
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, s4
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs:
@@ -934,23 +933,22 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
 ; GFX10-CONTRACT-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX10-CONTRACT-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; GFX10-CONTRACT-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v9, 0xffff
 ; GFX10-CONTRACT-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-CONTRACT-NEXT:    v_and_or_b32 v1, 0xffff, v1, s4
 ; GFX10-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX10-CONTRACT-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX10-CONTRACT-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v0, v0, v9, v6
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v2, v2, v9, v7
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v4, v4, v9, v8
+; GFX10-CONTRACT-NEXT:    v_and_or_b32 v0, 0xffff, v0, v6
+; GFX10-CONTRACT-NEXT:    v_and_or_b32 v2, 0xffff, v2, v7
+; GFX10-CONTRACT-NEXT:    v_and_or_b32 v4, 0xffff, v4, v8
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v2, v3, v9, s4
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v4, v5, v9, s4
+; GFX10-CONTRACT-NEXT:    v_and_or_b32 v2, 0xffff, v3, s4
+; GFX10-CONTRACT-NEXT:    v_and_or_b32 v4, 0xffff, v5, s4
 ; GFX10-CONTRACT-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v2, v4
 ; GFX10-CONTRACT-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v0, v0, v9, v3
+; GFX10-CONTRACT-NEXT:    v_and_or_b32 v1, 0xffff, v1, s4
+; GFX10-CONTRACT-NEXT:    v_and_or_b32 v0, 0xffff, v0, v3
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs:
@@ -959,30 +957,29 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
 ; GFX10-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-DENORM-NEXT:    v_mov_b32_e32 v8, 0xffff
 ; GFX10-DENORM-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-DENORM-NEXT:    v_and_or_b32 v1, 0xffff, v1, s4
 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-DENORM-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-DENORM-NEXT:    v_and_or_b32 v3, v3, v8, s4
-; GFX10-DENORM-NEXT:    v_and_or_b32 v0, v0, v8, v6
-; GFX10-DENORM-NEXT:    v_and_or_b32 v2, v2, v8, v7
+; GFX10-DENORM-NEXT:    v_and_or_b32 v3, 0xffff, v3, s4
+; GFX10-DENORM-NEXT:    v_and_or_b32 v0, 0xffff, v0, v6
+; GFX10-DENORM-NEXT:    v_and_or_b32 v2, 0xffff, v2, v7
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX10-DENORM-NEXT:    v_and_or_b32 v1, v1, v8, s4
+; GFX10-DENORM-NEXT:    v_and_or_b32 v1, 0xffff, v1, s4
 ; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-DENORM-NEXT:    v_and_or_b32 v2, v4, v8, v2
-; GFX10-DENORM-NEXT:    v_and_or_b32 v0, v0, v8, v6
+; GFX10-DENORM-NEXT:    v_and_or_b32 v2, 0xffff, v4, v2
+; GFX10-DENORM-NEXT:    v_and_or_b32 v0, 0xffff, v0, v6
 ; GFX10-DENORM-NEXT:    v_pk_add_f16 v0, v2, v0
-; GFX10-DENORM-NEXT:    v_and_or_b32 v2, v5, v8, s4
+; GFX10-DENORM-NEXT:    v_and_or_b32 v2, 0xffff, v5, s4
 ; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v2, v1
 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-DENORM-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-DENORM-NEXT:    v_and_or_b32 v0, v0, v8, v3
+; GFX10-DENORM-NEXT:    v_and_or_b32 v1, 0xffff, v1, s4
+; GFX10-DENORM-NEXT:    v_and_or_b32 v0, 0xffff, v0, v3
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
@@ -992,23 +989,22 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
 ; GFX10-UNSAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX10-UNSAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; GFX10-UNSAFE-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX10-UNSAFE-NEXT:    v_mov_b32_e32 v9, 0xffff
 ; GFX10-UNSAFE-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-UNSAFE-NEXT:    v_and_or_b32 v1, 0xffff, v1, s4
 ; GFX10-UNSAFE-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX10-UNSAFE-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX10-UNSAFE-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v0, v0, v9, v6
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v2, v2, v9, v7
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v4, v4, v9, v8
+; GFX10-UNSAFE-NEXT:    v_and_or_b32 v0, 0xffff, v0, v6
+; GFX10-UNSAFE-NEXT:    v_and_or_b32 v2, 0xffff, v2, v7
+; GFX10-UNSAFE-NEXT:    v_and_or_b32 v4, 0xffff, v4, v8
 ; GFX10-UNSAFE-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v2, v3, v9, s4
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v4, v5, v9, s4
+; GFX10-UNSAFE-NEXT:    v_and_or_b32 v2, 0xffff, v3, s4
+; GFX10-UNSAFE-NEXT:    v_and_or_b32 v4, 0xffff, v5, s4
 ; GFX10-UNSAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX10-UNSAFE-NEXT:    v_pk_fma_f16 v1, v1, v2, v4
 ; GFX10-UNSAFE-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v0, v0, v9, v3
+; GFX10-UNSAFE-NEXT:    v_and_or_b32 v1, 0xffff, v1, s4
+; GFX10-UNSAFE-NEXT:    v_and_or_b32 v0, 0xffff, v0, v3
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <3 x half> %x, %y

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll
index bb0a3b352eb37..84002c0e3f22b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll
@@ -113,10 +113,9 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_ext_neg_mul(<4 x half> %x,
 ;
 ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %entry
-; GFX10-DENORM-NEXT:    s_mov_b32 s0, 0x80008000
+; GFX10-DENORM-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-DENORM-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_e32 v8, v0
-; GFX10-DENORM-NEXT:    v_xor_b32_e32 v2, s0, v2
-; GFX10-DENORM-NEXT:    v_xor_b32_e32 v3, s0, v3
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_e32 v10, v1
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -155,10 +154,9 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_neg_ext_mul(<4 x half> %x,
 ;
 ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %entry
-; GFX10-DENORM-NEXT:    s_mov_b32 s0, 0x80008000
+; GFX10-DENORM-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-DENORM-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_e32 v8, v0
-; GFX10-DENORM-NEXT:    v_xor_b32_e32 v2, s0, v2
-; GFX10-DENORM-NEXT:    v_xor_b32_e32 v3, s0, v3
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_e32 v10, v1
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -198,10 +196,9 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_ext_neg_mul2(<4 x float> %
 ;
 ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul2:
 ; GFX10-DENORM:       ; %bb.0: ; %entry
-; GFX10-DENORM-NEXT:    s_mov_b32 s0, 0x80008000
+; GFX10-DENORM-NEXT:    v_xor_b32_e32 v6, 0x80008000, v6
+; GFX10-DENORM-NEXT:    v_xor_b32_e32 v7, 0x80008000, v7
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_e32 v8, v4
-; GFX10-DENORM-NEXT:    v_xor_b32_e32 v6, s0, v6
-; GFX10-DENORM-NEXT:    v_xor_b32_e32 v7, s0, v7
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_e32 v9, v5
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -240,10 +237,9 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_neg_ext_mul2(<4 x float> %
 ;
 ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul2:
 ; GFX10-DENORM:       ; %bb.0: ; %entry
-; GFX10-DENORM-NEXT:    s_mov_b32 s0, 0x80008000
+; GFX10-DENORM-NEXT:    v_xor_b32_e32 v6, 0x80008000, v6
+; GFX10-DENORM-NEXT:    v_xor_b32_e32 v7, 0x80008000, v7
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_e32 v8, v4
-; GFX10-DENORM-NEXT:    v_xor_b32_e32 v6, s0, v6
-; GFX10-DENORM-NEXT:    v_xor_b32_e32 v7, s0, v7
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_e32 v9, v5
 ; GFX10-DENORM-NEXT:    v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
index c40b187134789..088c86548401b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -488,10 +488,9 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX10-NEXT:    v_add_f16_e64 v2, v0, -v4
 ; GFX10-NEXT:    v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_add_f16_e64 v3, v1, -v5
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_and_or_b32 v0, v2, v4, v0
-; GFX10-NEXT:    v_and_or_b32 v1, v3, v4, v1
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v3, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -511,10 +510,9 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX10-DENORM-NEXT:    v_add_f16_e64 v2, v0, -v4
 ; GFX10-DENORM-NEXT:    v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-DENORM-NEXT:    v_add_f16_e64 v3, v1, -v5
-; GFX10-DENORM-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-DENORM-NEXT:    v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_and_or_b32 v0, v2, v4, v0
-; GFX10-DENORM-NEXT:    v_and_or_b32 v1, v3, v4, v1
+; GFX10-DENORM-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-DENORM-NEXT:    v_and_or_b32 v1, 0xffff, v3, v1
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <4 x half> %x, %y
@@ -567,10 +565,9 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX10-NEXT:    v_add_f16_e64 v2, v4, -v0
 ; GFX10-NEXT:    v_add_f16_sdwa v0, v4, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_add_f16_e64 v3, v5, -v1
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_add_f16_sdwa v1, v5, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_and_or_b32 v0, v2, v4, v0
-; GFX10-NEXT:    v_and_or_b32 v1, v3, v4, v1
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v3, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -590,10 +587,9 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX10-DENORM-NEXT:    v_add_f16_e64 v2, v4, -v0
 ; GFX10-DENORM-NEXT:    v_add_f16_sdwa v0, v4, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-DENORM-NEXT:    v_add_f16_e64 v3, v5, -v1
-; GFX10-DENORM-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-DENORM-NEXT:    v_add_f16_sdwa v1, v5, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_and_or_b32 v0, v2, v4, v0
-; GFX10-DENORM-NEXT:    v_and_or_b32 v1, v3, v4, v1
+; GFX10-DENORM-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-DENORM-NEXT:    v_and_or_b32 v1, 0xffff, v3, v1
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <4 x half> %x, %y

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
index f148d9b10ac76..fb96801ca2cc3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
@@ -90,9 +90,8 @@ define half @test_f16_sub_ext_neg_mul(half %x, half %y, half %z) {
 ; GFX10-CONTRACT:       ; %bb.0: ; %entry
 ; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-CONTRACT-NEXT:    s_mov_b32 s4, 0x8000
-; GFX10-CONTRACT-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX10-CONTRACT-NEXT:    v_xor_b32_e32 v2, s4, v2
+; GFX10-CONTRACT-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX10-CONTRACT-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
 ; GFX10-CONTRACT-NEXT:    v_fma_f16 v0, v0, v1, v2
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -278,10 +277,9 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX10-NEXT:    v_add_f16_e64 v2, v0, -v4
 ; GFX10-NEXT:    v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_add_f16_e64 v3, v1, -v5
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_and_or_b32 v0, v2, v4, v0
-; GFX10-NEXT:    v_and_or_b32 v1, v3, v4, v1
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v3, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -301,10 +299,9 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX10-DENORM-NEXT:    v_add_f16_e64 v2, v0, -v4
 ; GFX10-DENORM-NEXT:    v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-DENORM-NEXT:    v_add_f16_e64 v3, v1, -v5
-; GFX10-DENORM-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-DENORM-NEXT:    v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_and_or_b32 v0, v2, v4, v0
-; GFX10-DENORM-NEXT:    v_and_or_b32 v1, v3, v4, v1
+; GFX10-DENORM-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-DENORM-NEXT:    v_and_or_b32 v1, 0xffff, v3, v1
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %a = fmul <4 x half> %x, %y

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
index a8e207856f5df..bb35b9c25d25a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
@@ -763,29 +763,26 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* in
 ; GCN-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_mov_b32 s7, 0x80008
-; GCN-NEXT:    s_movk_i32 s2, 0xff
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bfe_u32 s8, s0, s7
-; GCN-NEXT:    s_and_b32 s6, s0, s2
-; GCN-NEXT:    s_lshl_b32 s8, s8, 8
-; GCN-NEXT:    s_or_b32 s6, s6, s8
-; GCN-NEXT:    s_mov_b32 s8, 0x80010
-; GCN-NEXT:    s_lshr_b32 s3, s0, 24
-; GCN-NEXT:    s_bfe_u32 s0, s0, s8
+; GCN-NEXT:    s_bfe_u32 s6, s0, 0x80008
+; GCN-NEXT:    s_lshr_b32 s2, s0, 24
+; GCN-NEXT:    s_and_b32 s5, s0, 0xff
+; GCN-NEXT:    s_lshl_b32 s6, s6, 8
+; GCN-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GCN-NEXT:    s_or_b32 s5, s5, s6
 ; GCN-NEXT:    s_lshl_b32 s0, s0, 16
-; GCN-NEXT:    s_or_b32 s0, s6, s0
-; GCN-NEXT:    s_lshl_b32 s3, s3, 24
-; GCN-NEXT:    s_or_b32 s0, s0, s3
-; GCN-NEXT:    s_bfe_u32 s3, s1, s7
-; GCN-NEXT:    s_lshr_b32 s5, s1, 24
-; GCN-NEXT:    s_and_b32 s2, s1, s2
-; GCN-NEXT:    s_lshl_b32 s3, s3, 8
-; GCN-NEXT:    s_bfe_u32 s1, s1, s8
-; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_or_b32 s0, s5, s0
+; GCN-NEXT:    s_lshl_b32 s2, s2, 24
+; GCN-NEXT:    s_bfe_u32 s5, s1, 0x80008
+; GCN-NEXT:    s_lshr_b32 s3, s1, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s2, s1, 0xff
+; GCN-NEXT:    s_lshl_b32 s5, s5, 8
+; GCN-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GCN-NEXT:    s_or_b32 s2, s2, s5
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s1, s2, s1
-; GCN-NEXT:    s_lshl_b32 s2, s5, 24
+; GCN-NEXT:    s_lshl_b32 s2, s3, 24
 ; GCN-NEXT:    s_or_b32 s1, s1, s2
 ; GCN-NEXT:    s_lshr_b32 s2, s4, 2
 ; GCN-NEXT:    s_cmp_eq_u32 s2, 1
@@ -798,32 +795,29 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* in
 ; GFX10-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s3, 0x80008
-; GFX10-NEXT:    s_movk_i32 s2, 0xff
-; GFX10-NEXT:    s_mov_b32 s5, 0x80010
-; GFX10-NEXT:    s_lshr_b32 s6, s4, 2
+; GFX10-NEXT:    s_lshr_b32 s2, s4, 2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s10, s0, s3
-; GFX10-NEXT:    s_bfe_u32 s3, s1, s3
-; GFX10-NEXT:    s_lshr_b32 s7, s0, 24
-; GFX10-NEXT:    s_lshr_b32 s8, s1, 24
-; GFX10-NEXT:    s_and_b32 s9, s0, s2
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s5
-; GFX10-NEXT:    s_and_b32 s2, s1, s2
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s5
-; GFX10-NEXT:    s_lshl_b32 s5, s10, 8
-; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_bfe_u32 s7, s0, 0x80008
+; GFX10-NEXT:    s_bfe_u32 s9, s1, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX10-NEXT:    s_lshr_b32 s5, s1, 24
+; GFX10-NEXT:    s_and_b32 s6, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX10-NEXT:    s_and_b32 s8, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX10-NEXT:    s_lshl_b32 s9, s9, 8
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX10-NEXT:    s_or_b32 s5, s9, s5
-; GFX10-NEXT:    s_or_b32 s2, s2, s3
-; GFX10-NEXT:    s_lshl_b32 s7, s7, 24
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 24
-; GFX10-NEXT:    s_or_b32 s0, s5, s0
-; GFX10-NEXT:    s_or_b32 s1, s2, s1
-; GFX10-NEXT:    s_or_b32 s0, s0, s7
-; GFX10-NEXT:    s_or_b32 s1, s1, s8
-; GFX10-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX10-NEXT:    s_or_b32 s6, s6, s7
+; GFX10-NEXT:    s_or_b32 s7, s8, s9
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 24
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX10-NEXT:    s_or_b32 s0, s6, s0
+; GFX10-NEXT:    s_or_b32 s1, s7, s1
+; GFX10-NEXT:    s_or_b32 s0, s0, s3
+; GFX10-NEXT:    s_or_b32 s1, s1, s5
+; GFX10-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX10-NEXT:    s_cselect_b32 s0, s1, s0
 ; GFX10-NEXT:    s_and_b32 s1, s4, 3
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 3
@@ -934,7 +928,6 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    s_mov_b32 s1, 16
-; GFX10-NEXT:    s_movk_i32 s3, 0xff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -942,9 +935,9 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s3, v3
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s3, v5
+; GFX10-NEXT:    v_and_or_b32 v1, v1, 0xff, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX10-NEXT:    s_lshr_b32 s0, s2, 2
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v6, v2
@@ -1063,7 +1056,6 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_mov_b32 s4, 8
 ; GFX10-NEXT:    s_mov_b32 s5, 16
-; GFX10-NEXT:    s_movk_i32 s6, 0xff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -1071,9 +1063,9 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s6, v4
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s6, v6
+; GFX10-NEXT:    v_and_or_b32 v1, v1, 0xff, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 2, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 3, v2
@@ -1093,37 +1085,34 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* in
 ; GCN-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_mov_b32 s6, 0x80008
-; GCN-NEXT:    s_movk_i32 s2, 0xff
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bfe_u32 s7, s0, s6
-; GCN-NEXT:    s_and_b32 s5, s0, s2
-; GCN-NEXT:    s_lshl_b32 s7, s7, 8
-; GCN-NEXT:    s_or_b32 s5, s5, s7
-; GCN-NEXT:    s_mov_b32 s7, 0x80010
-; GCN-NEXT:    s_lshr_b32 s3, s0, 24
-; GCN-NEXT:    s_bfe_u32 s0, s0, s7
+; GCN-NEXT:    s_bfe_u32 s5, s0, 0x80008
+; GCN-NEXT:    s_lshr_b32 s2, s0, 24
+; GCN-NEXT:    s_and_b32 s4, s0, 0xff
+; GCN-NEXT:    s_lshl_b32 s5, s5, 8
+; GCN-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GCN-NEXT:    s_or_b32 s4, s4, s5
 ; GCN-NEXT:    s_lshl_b32 s0, s0, 16
-; GCN-NEXT:    s_or_b32 s0, s5, s0
-; GCN-NEXT:    s_lshl_b32 s3, s3, 24
-; GCN-NEXT:    s_or_b32 s0, s0, s3
-; GCN-NEXT:    s_bfe_u32 s3, s1, s6
-; GCN-NEXT:    s_lshr_b32 s4, s1, 24
-; GCN-NEXT:    s_and_b32 s2, s1, s2
-; GCN-NEXT:    s_lshl_b32 s3, s3, 8
-; GCN-NEXT:    s_bfe_u32 s1, s1, s7
-; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_or_b32 s0, s4, s0
+; GCN-NEXT:    s_lshl_b32 s2, s2, 24
+; GCN-NEXT:    s_bfe_u32 s4, s1, 0x80008
+; GCN-NEXT:    s_lshr_b32 s3, s1, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s2, s1, 0xff
+; GCN-NEXT:    s_lshl_b32 s4, s4, 8
+; GCN-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GCN-NEXT:    s_or_b32 s2, s2, s4
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s1, s2, s1
-; GCN-NEXT:    s_lshl_b32 s2, s4, 24
+; GCN-NEXT:    s_lshl_b32 s2, s3, 24
 ; GCN-NEXT:    s_or_b32 s1, s1, s2
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -1131,33 +1120,30 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* in
 ; GFX10-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s3, 0x80008
-; GFX10-NEXT:    s_movk_i32 s2, 0xff
-; GFX10-NEXT:    s_mov_b32 s4, 0x80010
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s8, s0, s3
-; GFX10-NEXT:    s_bfe_u32 s3, s1, s3
-; GFX10-NEXT:    s_lshr_b32 s6, s1, 24
-; GFX10-NEXT:    s_and_b32 s7, s0, s2
-; GFX10-NEXT:    s_and_b32 s2, s1, s2
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s4
-; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_bfe_u32 s7, s1, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX10-NEXT:    s_and_b32 s6, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 8
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX10-NEXT:    s_or_b32 s2, s2, s3
-; GFX10-NEXT:    s_lshl_b32 s3, s6, 24
-; GFX10-NEXT:    s_or_b32 s1, s2, s1
-; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s4
-; GFX10-NEXT:    s_lshl_b32 s4, s8, 8
+; GFX10-NEXT:    s_or_b32 s6, s6, s7
+; GFX10-NEXT:    s_bfe_u32 s5, s0, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 24
+; GFX10-NEXT:    s_or_b32 s1, s6, s1
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 24
+; GFX10-NEXT:    s_and_b32 s4, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX10-NEXT:    s_or_b32 s1, s1, s3
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_or_b32 s3, s7, s4
+; GFX10-NEXT:    s_or_b32 s3, s4, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s1
-; GFX10-NEXT:    s_lshl_b32 s2, s5, 24
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 24
 ; GFX10-NEXT:    s_or_b32 s0, s3, s0
 ; GFX10-NEXT:    s_or_b32 s0, s0, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, s0, v2, vcc_lo
@@ -2089,45 +2075,42 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)*
 ; GCN-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT:    s_mov_b32 s11, 0x80008
-; GCN-NEXT:    s_movk_i32 s9, 0xff
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bfe_u32 s12, s0, s11
-; GCN-NEXT:    s_and_b32 s10, s0, s9
-; GCN-NEXT:    s_lshl_b32 s12, s12, 8
-; GCN-NEXT:    s_or_b32 s10, s10, s12
-; GCN-NEXT:    s_mov_b32 s12, 0x80010
+; GCN-NEXT:    s_bfe_u32 s10, s0, 0x80008
 ; GCN-NEXT:    s_lshr_b32 s5, s0, 24
-; GCN-NEXT:    s_bfe_u32 s0, s0, s12
+; GCN-NEXT:    s_and_b32 s9, s0, 0xff
+; GCN-NEXT:    s_lshl_b32 s10, s10, 8
+; GCN-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GCN-NEXT:    s_or_b32 s9, s9, s10
 ; GCN-NEXT:    s_lshl_b32 s0, s0, 16
-; GCN-NEXT:    s_or_b32 s0, s10, s0
+; GCN-NEXT:    s_or_b32 s0, s9, s0
 ; GCN-NEXT:    s_lshl_b32 s5, s5, 24
-; GCN-NEXT:    s_bfe_u32 s10, s1, s11
+; GCN-NEXT:    s_bfe_u32 s9, s1, 0x80008
 ; GCN-NEXT:    s_lshr_b32 s6, s1, 24
 ; GCN-NEXT:    s_or_b32 s0, s0, s5
-; GCN-NEXT:    s_and_b32 s5, s1, s9
-; GCN-NEXT:    s_lshl_b32 s10, s10, 8
-; GCN-NEXT:    s_bfe_u32 s1, s1, s12
-; GCN-NEXT:    s_or_b32 s5, s5, s10
+; GCN-NEXT:    s_and_b32 s5, s1, 0xff
+; GCN-NEXT:    s_lshl_b32 s9, s9, 8
+; GCN-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GCN-NEXT:    s_or_b32 s5, s5, s9
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s1, s5, s1
 ; GCN-NEXT:    s_lshl_b32 s5, s6, 24
-; GCN-NEXT:    s_bfe_u32 s6, s2, s11
+; GCN-NEXT:    s_bfe_u32 s6, s2, 0x80008
 ; GCN-NEXT:    s_lshr_b32 s7, s2, 24
 ; GCN-NEXT:    s_or_b32 s1, s1, s5
-; GCN-NEXT:    s_and_b32 s5, s2, s9
+; GCN-NEXT:    s_and_b32 s5, s2, 0xff
 ; GCN-NEXT:    s_lshl_b32 s6, s6, 8
-; GCN-NEXT:    s_bfe_u32 s2, s2, s12
+; GCN-NEXT:    s_bfe_u32 s2, s2, 0x80010
 ; GCN-NEXT:    s_or_b32 s5, s5, s6
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 16
 ; GCN-NEXT:    s_or_b32 s2, s5, s2
 ; GCN-NEXT:    s_lshl_b32 s5, s7, 24
-; GCN-NEXT:    s_bfe_u32 s6, s3, s11
+; GCN-NEXT:    s_bfe_u32 s6, s3, 0x80008
 ; GCN-NEXT:    s_lshr_b32 s8, s3, 24
 ; GCN-NEXT:    s_or_b32 s2, s2, s5
-; GCN-NEXT:    s_and_b32 s5, s3, s9
+; GCN-NEXT:    s_and_b32 s5, s3, 0xff
 ; GCN-NEXT:    s_lshl_b32 s6, s6, 8
-; GCN-NEXT:    s_bfe_u32 s3, s3, s12
+; GCN-NEXT:    s_bfe_u32 s3, s3, 0x80010
 ; GCN-NEXT:    s_or_b32 s5, s5, s6
 ; GCN-NEXT:    s_lshl_b32 s3, s3, 16
 ; GCN-NEXT:    s_or_b32 s3, s5, s3
@@ -2148,50 +2131,47 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)*
 ; GFX10-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s6, 0x80008
-; GFX10-NEXT:    s_movk_i32 s5, 0xff
-; GFX10-NEXT:    s_mov_b32 s7, 0x80010
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s13, s0, s6
-; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
-; GFX10-NEXT:    s_and_b32 s12, s0, s5
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s7
-; GFX10-NEXT:    s_lshl_b32 s13, s13, 8
-; GFX10-NEXT:    s_bfe_u32 s15, s1, s6
-; GFX10-NEXT:    s_bfe_u32 s17, s2, s6
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_or_b32 s12, s12, s13
-; GFX10-NEXT:    s_bfe_u32 s6, s3, s6
-; GFX10-NEXT:    s_lshr_b32 s9, s1, 24
-; GFX10-NEXT:    s_lshr_b32 s10, s2, 24
-; GFX10-NEXT:    s_lshr_b32 s11, s3, 24
-; GFX10-NEXT:    s_and_b32 s14, s1, s5
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s7
-; GFX10-NEXT:    s_and_b32 s16, s2, s5
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 24
-; GFX10-NEXT:    s_lshl_b32 s15, s15, 8
-; GFX10-NEXT:    s_lshl_b32 s17, s17, 8
-; GFX10-NEXT:    s_or_b32 s0, s12, s0
-; GFX10-NEXT:    s_bfe_u32 s2, s2, s7
-; GFX10-NEXT:    s_and_b32 s5, s3, s5
-; GFX10-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX10-NEXT:    s_bfe_u32 s3, s3, s7
+; GFX10-NEXT:    s_bfe_u32 s10, s0, 0x80008
+; GFX10-NEXT:    s_bfe_u32 s12, s1, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s6, s1, 24
+; GFX10-NEXT:    s_and_b32 s9, s0, 0xff
+; GFX10-NEXT:    s_and_b32 s11, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX10-NEXT:    s_lshl_b32 s12, s12, 8
+; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX10-NEXT:    s_or_b32 s13, s14, s15
-; GFX10-NEXT:    s_or_b32 s0, s0, s8
-; GFX10-NEXT:    s_or_b32 s8, s16, s17
+; GFX10-NEXT:    s_or_b32 s9, s9, s10
+; GFX10-NEXT:    s_or_b32 s10, s11, s12
+; GFX10-NEXT:    s_bfe_u32 s14, s2, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX10-NEXT:    s_or_b32 s1, s10, s1
+; GFX10-NEXT:    s_lshr_b32 s7, s2, 24
+; GFX10-NEXT:    s_and_b32 s13, s2, 0xff
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX10-NEXT:    s_lshl_b32 s14, s14, 8
+; GFX10-NEXT:    s_or_b32 s0, s9, s0
+; GFX10-NEXT:    s_or_b32 s1, s1, s6
+; GFX10-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s8, s3, 24
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX10-NEXT:    s_or_b32 s5, s5, s6
+; GFX10-NEXT:    s_or_b32 s11, s13, s14
+; GFX10-NEXT:    s_or_b32 s0, s0, s5
+; GFX10-NEXT:    s_lshl_b32 s5, s7, 24
+; GFX10-NEXT:    s_and_b32 s7, s3, 0xff
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX10-NEXT:    s_bfe_u32 s3, s3, 0x80010
+; GFX10-NEXT:    s_or_b32 s2, s11, s2
+; GFX10-NEXT:    s_or_b32 s6, s7, s6
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX10-NEXT:    s_lshl_b32 s9, s9, 24
-; GFX10-NEXT:    s_or_b32 s1, s13, s1
-; GFX10-NEXT:    s_or_b32 s2, s8, s2
-; GFX10-NEXT:    s_lshl_b32 s8, s10, 24
-; GFX10-NEXT:    s_or_b32 s3, s5, s3
-; GFX10-NEXT:    s_lshl_b32 s5, s11, 24
+; GFX10-NEXT:    s_or_b32 s2, s2, s5
+; GFX10-NEXT:    s_or_b32 s3, s6, s3
+; GFX10-NEXT:    s_lshl_b32 s5, s8, 24
 ; GFX10-NEXT:    s_lshr_b32 s6, s4, 2
-; GFX10-NEXT:    s_or_b32 s1, s1, s9
-; GFX10-NEXT:    s_or_b32 s2, s2, s8
 ; GFX10-NEXT:    s_or_b32 s3, s3, s5
 ; GFX10-NEXT:    s_cmp_eq_u32 s6, 1
 ; GFX10-NEXT:    s_cselect_b32 s0, s1, s0
@@ -2371,37 +2351,35 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    v_mov_b32_e32 v5, 8
+; GFX10-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX10-NEXT:    s_mov_b32 s1, 16
-; GFX10-NEXT:    s_movk_i32 s3, 0xff
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX10-NEXT:    v_mov_b32_e32 v6, 16
+; GFX10-NEXT:    v_mov_b32_e32 v5, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s3, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_and_or_b32 v1, v1, 0xff, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s3, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX10-NEXT:    s_lshr_b32 s0, s2, 2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v4, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v3
-; GFX10-NEXT:    v_or3_b32 v0, v0, v11, v7
-; GFX10-NEXT:    v_or3_b32 v1, v1, v13, v8
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v2, 0xff, v2, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX10-NEXT:    v_or3_b32 v0, v0, v10, v6
+; GFX10-NEXT:    v_or3_b32 v1, v1, v12, v7
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX10-NEXT:    v_or3_b32 v2, v2, v15, v9
-; GFX10-NEXT:    v_and_or_b32 v4, v3, v4, v5
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v10
+; GFX10-NEXT:    v_or3_b32 v2, v2, v14, v8
+; GFX10-NEXT:    v_and_or_b32 v4, 0xff, v3, v4
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 2
 ; GFX10-NEXT:    v_or3_b32 v1, v4, v3, v5
@@ -2583,43 +2561,41 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v1, 8
+; GFX10-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX10-NEXT:    s_mov_b32 s5, 16
-; GFX10-NEXT:    s_movk_i32 s6, 0xff
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX10-NEXT:    v_mov_b32_e32 v7, 16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 2, v2
+; GFX10-NEXT:    v_mov_b32_e32 v1, 16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 2, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v5
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_or_b32 v3, v3, s6, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_and_or_b32 v3, v3, 0xff, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX10-NEXT:    v_and_or_b32 v4, v4, 0xff, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX10-NEXT:    v_and_or_b32 v4, v4, s6, v15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_and_or_b32 v5, 0xff, v5, v16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v6
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_or_b32 v5, v5, v0, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX10-NEXT:    v_or3_b32 v3, v3, v14, v9
-; GFX10-NEXT:    v_or3_b32 v4, v4, v16, v10
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v6, v0, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 24, v12
-; GFX10-NEXT:    v_or3_b32 v5, v5, v18, v11
+; GFX10-NEXT:    v_or3_b32 v3, v3, v13, v8
+; GFX10-NEXT:    v_or3_b32 v4, v4, v15, v9
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v6, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v11
+; GFX10-NEXT:    v_or3_b32 v5, v5, v17, v10
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v8
-; GFX10-NEXT:    v_or3_b32 v0, v0, v7, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v7
+; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v8
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
@@ -2633,47 +2609,46 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
 ; GCN-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT:    s_mov_b32 s10, 0x80008
-; GCN-NEXT:    s_movk_i32 s8, 0xff
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bfe_u32 s11, s0, s10
-; GCN-NEXT:    s_and_b32 s9, s0, s8
-; GCN-NEXT:    s_lshl_b32 s11, s11, 8
-; GCN-NEXT:    s_or_b32 s9, s9, s11
-; GCN-NEXT:    s_mov_b32 s11, 0x80010
+; GCN-NEXT:    s_bfe_u32 s9, s0, 0x80008
 ; GCN-NEXT:    s_lshr_b32 s4, s0, 24
-; GCN-NEXT:    s_bfe_u32 s0, s0, s11
+; GCN-NEXT:    s_and_b32 s8, s0, 0xff
+; GCN-NEXT:    s_lshl_b32 s9, s9, 8
+; GCN-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GCN-NEXT:    s_or_b32 s8, s8, s9
 ; GCN-NEXT:    s_lshl_b32 s0, s0, 16
-; GCN-NEXT:    s_or_b32 s0, s9, s0
+; GCN-NEXT:    s_or_b32 s0, s8, s0
 ; GCN-NEXT:    s_lshl_b32 s4, s4, 24
-; GCN-NEXT:    s_bfe_u32 s9, s1, s10
+; GCN-NEXT:    s_bfe_u32 s8, s1, 0x80008
 ; GCN-NEXT:    s_lshr_b32 s5, s1, 24
 ; GCN-NEXT:    s_or_b32 s0, s0, s4
-; GCN-NEXT:    s_and_b32 s4, s1, s8
-; GCN-NEXT:    s_lshl_b32 s9, s9, 8
-; GCN-NEXT:    s_bfe_u32 s1, s1, s11
-; GCN-NEXT:    s_or_b32 s4, s4, s9
+; GCN-NEXT:    s_and_b32 s4, s1, 0xff
+; GCN-NEXT:    s_lshl_b32 s8, s8, 8
+; GCN-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GCN-NEXT:    s_or_b32 s4, s4, s8
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s1, s4, s1
 ; GCN-NEXT:    s_lshl_b32 s4, s5, 24
-; GCN-NEXT:    s_bfe_u32 s5, s2, s10
+; GCN-NEXT:    s_bfe_u32 s5, s2, 0x80008
 ; GCN-NEXT:    s_lshr_b32 s6, s2, 24
 ; GCN-NEXT:    s_or_b32 s1, s1, s4
-; GCN-NEXT:    s_and_b32 s4, s2, s8
+; GCN-NEXT:    s_and_b32 s4, s2, 0xff
 ; GCN-NEXT:    s_lshl_b32 s5, s5, 8
-; GCN-NEXT:    s_bfe_u32 s2, s2, s11
+; GCN-NEXT:    s_bfe_u32 s2, s2, 0x80010
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 16
 ; GCN-NEXT:    s_or_b32 s2, s4, s2
 ; GCN-NEXT:    s_lshl_b32 s4, s6, 24
-; GCN-NEXT:    s_bfe_u32 s5, s3, s10
+; GCN-NEXT:    s_bfe_u32 s5, s3, 0x80008
 ; GCN-NEXT:    s_lshr_b32 s7, s3, 24
 ; GCN-NEXT:    s_or_b32 s2, s2, s4
-; GCN-NEXT:    s_and_b32 s4, s3, s8
+; GCN-NEXT:    s_and_b32 s4, s3, 0xff
 ; GCN-NEXT:    s_lshl_b32 s5, s5, 8
-; GCN-NEXT:    s_bfe_u32 s3, s3, s11
+; GCN-NEXT:    s_bfe_u32 s3, s3, 0x80010
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
 ; GCN-NEXT:    s_lshl_b32 s3, s3, 16
 ; GCN-NEXT:    s_or_b32 s3, s4, s3
@@ -2687,9 +2662,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
 ; GCN-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -2697,58 +2670,55 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
 ; GFX10-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s5, 0x80008
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    s_mov_b32 s6, 0x80010
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s12, s0, s5
-; GFX10-NEXT:    s_bfe_u32 s14, s1, s5
-; GFX10-NEXT:    s_lshr_b32 s8, s1, 24
-; GFX10-NEXT:    s_and_b32 s11, s0, s4
-; GFX10-NEXT:    s_and_b32 s13, s1, s4
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s6
-; GFX10-NEXT:    s_lshl_b32 s12, s12, 8
-; GFX10-NEXT:    s_lshl_b32 s14, s14, 8
+; GFX10-NEXT:    s_bfe_u32 s9, s0, 0x80008
+; GFX10-NEXT:    s_bfe_u32 s11, s1, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s5, s1, 24
+; GFX10-NEXT:    s_and_b32 s8, s0, 0xff
+; GFX10-NEXT:    s_and_b32 s10, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX10-NEXT:    s_lshl_b32 s11, s11, 8
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX10-NEXT:    s_or_b32 s11, s11, s12
-; GFX10-NEXT:    s_or_b32 s12, s13, s14
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 24
-; GFX10-NEXT:    s_or_b32 s1, s12, s1
-; GFX10-NEXT:    s_lshr_b32 s7, s0, 24
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s6
-; GFX10-NEXT:    s_or_b32 s1, s1, s8
+; GFX10-NEXT:    s_or_b32 s8, s8, s9
+; GFX10-NEXT:    s_or_b32 s9, s10, s11
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX10-NEXT:    s_or_b32 s1, s9, s1
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX10-NEXT:    s_or_b32 s1, s1, s5
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_bfe_u32 s16, s2, s5
+; GFX10-NEXT:    s_bfe_u32 s13, s2, 0x80008
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s1
-; GFX10-NEXT:    s_lshl_b32 s7, s7, 24
-; GFX10-NEXT:    s_or_b32 s0, s11, s0
-; GFX10-NEXT:    s_lshr_b32 s9, s2, 24
-; GFX10-NEXT:    s_and_b32 s15, s2, s4
-; GFX10-NEXT:    s_lshl_b32 s16, s16, 8
-; GFX10-NEXT:    s_bfe_u32 s2, s2, s6
-; GFX10-NEXT:    s_or_b32 s0, s0, s7
-; GFX10-NEXT:    s_or_b32 s7, s15, s16
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX10-NEXT:    s_or_b32 s0, s8, s0
+; GFX10-NEXT:    s_lshr_b32 s6, s2, 24
+; GFX10-NEXT:    s_and_b32 s12, s2, 0xff
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s13, s13, 8
+; GFX10-NEXT:    s_or_b32 s0, s0, s4
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX10-NEXT:    s_bfe_u32 s5, s3, s5
+; GFX10-NEXT:    s_or_b32 s10, s12, s13
+; GFX10-NEXT:    s_bfe_u32 s5, s3, 0x80008
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, s0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; GFX10-NEXT:    s_or_b32 s2, s7, s2
-; GFX10-NEXT:    s_lshl_b32 s7, s9, 24
-; GFX10-NEXT:    s_and_b32 s4, s3, s4
+; GFX10-NEXT:    s_or_b32 s2, s10, s2
+; GFX10-NEXT:    s_lshl_b32 s4, s6, 24
+; GFX10-NEXT:    s_and_b32 s6, s3, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX10-NEXT:    s_bfe_u32 s1, s3, s6
-; GFX10-NEXT:    s_or_b32 s2, s2, s7
-; GFX10-NEXT:    s_lshr_b32 s10, s3, 24
-; GFX10-NEXT:    s_or_b32 s3, s4, s5
+; GFX10-NEXT:    s_bfe_u32 s1, s3, 0x80010
+; GFX10-NEXT:    s_or_b32 s2, s2, s4
+; GFX10-NEXT:    s_lshr_b32 s7, s3, 24
+; GFX10-NEXT:    s_or_b32 s3, s6, s5
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v1
 ; GFX10-NEXT:    s_or_b32 s0, s3, s1
-; GFX10-NEXT:    s_lshl_b32 s1, s10, 24
+; GFX10-NEXT:    s_lshl_b32 s1, s7, 24
 ; GFX10-NEXT:    s_or_b32 s3, s0, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s3, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index 140c3ff694079..9e1dc06de9331 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -901,19 +901,17 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-FLUSH:       ; %bb.0:
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-FLUSH-NEXT:    s_mov_b32 s4, 0x6f800000
-; GFX10-FLUSH-NEXT:    s_mov_b32 s5, 0x2f800000
-; GFX10-FLUSH-NEXT:    v_cmp_gt_f32_e64 s6, |v2|, s4
-; GFX10-FLUSH-NEXT:    v_cmp_gt_f32_e64 s4, |v3|, s4
-; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v4, 1.0, s5, s6
-; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v5, 1.0, s5, s4
+; GFX10-FLUSH-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v2|
+; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s4
+; GFX10-FLUSH-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v3|
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v4
-; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v5, 1.0, 0x2f800000, s4
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v5
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v4, v0
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v5, v1
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x float> %a, %b, !fpmath !0
@@ -1334,19 +1332,17 @@ define <2 x float> @v_rcp_v2f32_ulp25(<2 x float> %x) {
 ; GFX10-IEEE:       ; %bb.0:
 ; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-IEEE-NEXT:    s_mov_b32 s4, 0x6f800000
-; GFX10-IEEE-NEXT:    s_mov_b32 s5, 0x2f800000
-; GFX10-IEEE-NEXT:    v_cmp_gt_f32_e64 s6, |v0|, s4
-; GFX10-IEEE-NEXT:    v_cmp_gt_f32_e64 s4, |v1|, s4
-; GFX10-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, s5, s6
-; GFX10-IEEE-NEXT:    v_cndmask_b32_e64 v3, 1.0, s5, s4
+; GFX10-IEEE-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v0|
+; GFX10-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4
+; GFX10-IEEE-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v1|
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-IEEE-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x2f800000, s4
 ; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX10-IEEE-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v1, v3, v1
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1492,19 +1488,17 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-FLUSH:       ; %bb.0:
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-FLUSH-NEXT:    s_mov_b32 s4, 0x6f800000
-; GFX10-FLUSH-NEXT:    s_mov_b32 s5, 0x2f800000
-; GFX10-FLUSH-NEXT:    v_cmp_gt_f32_e64 s6, |v2|, s4
-; GFX10-FLUSH-NEXT:    v_cmp_gt_f32_e64 s4, |v3|, s4
-; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v4, 1.0, s5, s6
-; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v5, 1.0, s5, s4
+; GFX10-FLUSH-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v2|
+; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s4
+; GFX10-FLUSH-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v3|
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v4
-; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v5, 1.0, 0x2f800000, s4
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v5
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v4, v0
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v5, v1
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 1c3c79f8b867d..9cb1f58f4ad32 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -96,12 +96,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 4
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -156,12 +155,11 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, s32
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s32, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, s32, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -327,14 +325,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x104
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
-; GFX10-NEXT:    scratch_load_dword v2, off, off offset:4 glc dlc
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x104, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x104, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -396,16 +393,16 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
-; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
-; GFX10-NEXT:    scratch_load_dword v2, off, s32 glc dlc
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, vcc_lo, v0
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, vcc_lo, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -548,14 +545,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x4004
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
-; GFX10-NEXT:    scratch_load_dword v2, off, off offset:4 glc dlc
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -619,16 +615,16 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
-; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
-; GFX10-NEXT:    scratch_load_dword v2, off, s32 offset:4 glc dlc
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, vcc_lo, v0
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, vcc_lo, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
index 5d8ae58512663..9f7fe19a17ed9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
@@ -325,10 +325,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 s2, 0x80000000
-; GFX10-NEXT:    v_sub_f32_e32 v1, s2, v1
-; GFX10-NEXT:    v_sub_f32_e64 v3, s2, |v3|
+; GFX10-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
+; GFX10-NEXT:    v_sub_f32_e64 v3, 0x80000000, |v3|
 ; GFX10-NEXT:    v_med3_f32 v1, v1, |v2|, v3
 ; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
@@ -447,11 +445,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 s2, 0x80000000
-; GFX10-NEXT:    v_sub_f32_e64 v1, s2, |v1|
-; GFX10-NEXT:    v_sub_f32_e64 v2, s2, |v2|
-; GFX10-NEXT:    v_sub_f32_e64 v3, s2, |v3|
+; GFX10-NEXT:    v_sub_f32_e64 v1, 0x80000000, |v1|
+; GFX10-NEXT:    v_sub_f32_e64 v2, 0x80000000, |v2|
+; GFX10-NEXT:    v_sub_f32_e64 v3, 0x80000000, |v3|
 ; GFX10-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index 4667975ca0d01..7843173f11b37 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -473,9 +473,8 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_mov_b32 s4, 0x80008000
-; GFX10-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX10-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX10-NEXT:    v_log_f16_e32 v2, v0
 ; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index bb019f99b5ff1..cb407ee289800 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -9,8 +9,8 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    s_and_b32 s2, s2, 0x7f
 ; GFX6-NEXT:    s_movk_i32 s3, 0x7f
-; GFX6-NEXT:    s_and_b32 s2, s2, s3
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x60001
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -39,9 +39,9 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT:    s_and_b32 s2, s2, 0x7f
+; GFX8-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX8-NEXT:    s_movk_i32 s3, 0x7f
-; GFX8-NEXT:    s_and_b32 s2, s2, s3
-; GFX8-NEXT:    s_and_b32 s1, s1, s3
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
@@ -71,9 +71,9 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_and_b32 s2, s2, 0x7f
+; GFX9-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX9-NEXT:    s_movk_i32 s3, 0x7f
-; GFX9-NEXT:    s_and_b32 s2, s2, s3
-; GFX9-NEXT:    s_and_b32 s1, s1, s3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
@@ -102,11 +102,10 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX10-LABEL: s_fshl_i7:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
-; GFX10-NEXT:    s_movk_i32 s3, 0x7f
-; GFX10-NEXT:    s_and_b32 s2, s2, s3
-; GFX10-NEXT:    s_and_b32 s1, s1, s3
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT:    s_and_b32 s2, s2, 0x7f
+; GFX10-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -123,8 +122,8 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u16 v1, 6, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v0, s0
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v1, s1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -250,10 +249,9 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 7, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7f
-; GFX10-NEXT:    v_sub_nc_u16 v4, 6, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, v2, v3
-; GFX10-NEXT:    v_and_b32_e32 v3, v4, v3
+; GFX10-NEXT:    v_sub_nc_u16 v3, 6, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0x7f, v3
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v2, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v3, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -541,68 +539,65 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, s2
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, s4
-; GFX6-NEXT:    s_movk_i32 s6, 0xff
 ; GFX6-NEXT:    s_or_b32 s1, s2, s1
-; GFX6-NEXT:    s_and_b32 s1, s1, s6
-; GFX6-NEXT:    s_and_b32 s0, s0, s6
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_fshl_v2i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s6, s2, 7
-; GFX8-NEXT:    s_lshr_b32 s3, s0, 8
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s6
-; GFX8-NEXT:    s_movk_i32 s6, 0xff
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 8
-; GFX8-NEXT:    s_and_b32 s1, s1, s6
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s5, s2, 8
+; GFX8-NEXT:    s_and_b32 s6, s2, 7
 ; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s6
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s5, 7
 ; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
-; GFX8-NEXT:    s_and_b32 s3, s4, s6
+; GFX8-NEXT:    s_and_b32 s3, s4, 0xff
 ; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX8-NEXT:    s_andn2_b32 s2, 7, s5
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
 ; GFX8-NEXT:    s_lshr_b32 s2, s3, s2
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
-; GFX8-NEXT:    s_and_b32 s1, s1, s6
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_bfe_u32 s2, 8, 0x100000
-; GFX8-NEXT:    s_and_b32 s0, s0, s6
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_fshl_v2i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s6, s2, 7
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s6
-; GFX9-NEXT:    s_movk_i32 s6, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s4, s1, 8
-; GFX9-NEXT:    s_and_b32 s1, s1, s6
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX9-NEXT:    s_lshr_b32 s5, s2, 8
+; GFX9-NEXT:    s_and_b32 s6, s2, 7
 ; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s6
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s5, 7
 ; GFX9-NEXT:    s_lshl_b32 s1, s3, s1
-; GFX9-NEXT:    s_and_b32 s3, s4, s6
+; GFX9-NEXT:    s_and_b32 s3, s4, 0xff
 ; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX9-NEXT:    s_andn2_b32 s2, 7, s5
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
 ; GFX9-NEXT:    s_lshr_b32 s2, s3, s2
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
-; GFX9-NEXT:    s_and_b32 s1, s1, s6
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_bfe_u32 s2, 8, 0x100000
-; GFX9-NEXT:    s_and_b32 s0, s0, s6
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -610,28 +605,27 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX10-LABEL: s_fshl_v2i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_lshr_b32 s4, s1, 8
-; GFX10-NEXT:    s_movk_i32 s6, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s5, s2, 8
-; GFX10-NEXT:    s_and_b32 s4, s4, s6
-; GFX10-NEXT:    s_and_b32 s7, s2, 7
-; GFX10-NEXT:    s_and_b32 s1, s1, s6
+; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX10-NEXT:    s_and_b32 s6, s2, 7
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s7
-; GFX10-NEXT:    s_and_b32 s7, s5, 7
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s6
+; GFX10-NEXT:    s_and_b32 s6, s5, 7
 ; GFX10-NEXT:    s_andn2_b32 s5, 7, s5
 ; GFX10-NEXT:    s_lshr_b32 s4, s4, 1
 ; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX10-NEXT:    s_lshl_b32 s3, s3, s7
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s6
 ; GFX10-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX10-NEXT:    s_or_b32 s2, s3, s4
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
-; GFX10-NEXT:    s_and_b32 s1, s2, s6
+; GFX10-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX10-NEXT:    s_bfe_u32 s2, 8, 0x100000
-; GFX10-NEXT:    s_and_b32 s0, s0, s6
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -729,20 +723,20 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v3
-; GFX10-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GFX10-NEXT:    v_and_b32_e32 v3, 7, v3
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 7, v7
 ; GFX10-NEXT:    v_and_b32_e32 v6, 7, v6
 ; GFX10-NEXT:    v_lshrrev_b16 v4, 1, v4
-; GFX10-NEXT:    v_and_b32_e32 v7, 7, v7
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 1, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v3, v3, v5
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v2, v0
+; GFX10-NEXT:    s_movk_i32 s4, 0xff
 ; GFX10-NEXT:    v_lshrrev_b16 v4, v6, v4
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v7, v1
 ; GFX10-NEXT:    v_or_b32_e32 v2, v3, v4
@@ -785,22 +779,21 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX6-NEXT:    s_bfe_u32 s4, s1, 0x80010
 ; GFX6-NEXT:    s_andn2_b32 s6, 7, s7
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
-; GFX6-NEXT:    s_movk_i32 s10, 0xff
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s6
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    s_and_b32 s4, s8, 7
 ; GFX6-NEXT:    s_andn2_b32 s6, 7, s8
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 25
-; GFX6-NEXT:    s_and_b32 s2, s2, s10
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s4, s5, s4
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, s6
-; GFX6-NEXT:    s_and_b32 s0, s0, s10
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 8
 ; GFX6-NEXT:    s_or_b32 s1, s4, s1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_and_b32 s2, s3, s10
+; GFX6-NEXT:    s_and_b32 s2, s3, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, s10
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -808,11 +801,10 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ;
 ; GFX8-LABEL: s_fshl_v4i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_movk_i32 s13, 0xff
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 8
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s8, s1, 24
-; GFX8-NEXT:    s_and_b32 s1, s1, s13
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s9, s2, 8
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
@@ -828,7 +820,7 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s9, 7
 ; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
-; GFX8-NEXT:    s_and_b32 s3, s6, s13
+; GFX8-NEXT:    s_and_b32 s3, s6, 0xff
 ; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX8-NEXT:    s_andn2_b32 s2, 7, s9
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
@@ -836,37 +828,36 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    s_and_b32 s2, s10, 7
 ; GFX8-NEXT:    s_lshl_b32 s2, s4, s2
-; GFX8-NEXT:    s_and_b32 s4, s7, s13
+; GFX8-NEXT:    s_and_b32 s4, s7, 0xff
 ; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX8-NEXT:    s_andn2_b32 s3, 7, s10
 ; GFX8-NEXT:    s_lshr_b32 s4, s4, 1
 ; GFX8-NEXT:    s_lshr_b32 s3, s4, s3
 ; GFX8-NEXT:    s_or_b32 s2, s2, s3
 ; GFX8-NEXT:    s_and_b32 s3, s11, 7
-; GFX8-NEXT:    s_and_b32 s1, s1, s13
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_andn2_b32 s4, 7, s11
 ; GFX8-NEXT:    s_lshl_b32 s3, s5, s3
 ; GFX8-NEXT:    s_lshr_b32 s5, s8, 1
-; GFX8-NEXT:    s_and_b32 s0, s0, s13
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_lshr_b32 s4, s5, s4
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s2, s13
+; GFX8-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s3, s13
+; GFX8-NEXT:    s_and_b32 s1, s3, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_fshl_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s13, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s6, s1, 8
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 24
-; GFX9-NEXT:    s_and_b32 s1, s1, s13
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX9-NEXT:    s_lshr_b32 s9, s2, 8
 ; GFX9-NEXT:    s_lshr_b32 s10, s2, 16
@@ -882,7 +873,7 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s9, 7
 ; GFX9-NEXT:    s_lshl_b32 s1, s3, s1
-; GFX9-NEXT:    s_and_b32 s3, s6, s13
+; GFX9-NEXT:    s_and_b32 s3, s6, 0xff
 ; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX9-NEXT:    s_andn2_b32 s2, 7, s9
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
@@ -890,79 +881,78 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
 ; GFX9-NEXT:    s_and_b32 s2, s10, 7
 ; GFX9-NEXT:    s_lshl_b32 s2, s4, s2
-; GFX9-NEXT:    s_and_b32 s4, s7, s13
+; GFX9-NEXT:    s_and_b32 s4, s7, 0xff
 ; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX9-NEXT:    s_andn2_b32 s3, 7, s10
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, 1
 ; GFX9-NEXT:    s_lshr_b32 s3, s4, s3
 ; GFX9-NEXT:    s_or_b32 s2, s2, s3
 ; GFX9-NEXT:    s_and_b32 s3, s11, 7
-; GFX9-NEXT:    s_and_b32 s1, s1, s13
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_andn2_b32 s4, 7, s11
 ; GFX9-NEXT:    s_lshl_b32 s3, s5, s3
 ; GFX9-NEXT:    s_lshr_b32 s5, s8, 1
-; GFX9-NEXT:    s_and_b32 s0, s0, s13
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX9-NEXT:    s_lshr_b32 s4, s5, s4
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s2, s13
+; GFX9-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX9-NEXT:    s_or_b32 s3, s3, s4
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s3, s13
+; GFX9-NEXT:    s_and_b32 s1, s3, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_fshl_v4i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_movk_i32 s11, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s6, s1, 8
 ; GFX10-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s8, s1, 24
-; GFX10-NEXT:    s_and_b32 s1, s1, s11
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s9, s2, 8
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX10-NEXT:    s_lshr_b32 s12, s2, 24
-; GFX10-NEXT:    s_and_b32 s13, s2, 7
+; GFX10-NEXT:    s_lshr_b32 s11, s2, 24
+; GFX10-NEXT:    s_and_b32 s12, s2, 7
 ; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
-; GFX10-NEXT:    s_and_b32 s2, s6, s11
+; GFX10-NEXT:    s_and_b32 s2, s6, 0xff
 ; GFX10-NEXT:    s_and_b32 s6, s9, 7
 ; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX10-NEXT:    s_andn2_b32 s9, 7, s9
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s13
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s12
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, s6
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, s9
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_or_b32 s1, s3, s2
-; GFX10-NEXT:    s_and_b32 s2, s7, s11
+; GFX10-NEXT:    s_and_b32 s2, s7, 0xff
 ; GFX10-NEXT:    s_and_b32 s3, s10, 7
 ; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX10-NEXT:    s_andn2_b32 s6, 7, s10
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, s6
-; GFX10-NEXT:    s_and_b32 s4, s12, 7
-; GFX10-NEXT:    s_andn2_b32 s6, 7, s12
+; GFX10-NEXT:    s_and_b32 s4, s11, 7
+; GFX10-NEXT:    s_andn2_b32 s6, 7, s11
 ; GFX10-NEXT:    s_lshr_b32 s7, s8, 1
 ; GFX10-NEXT:    s_lshl_b32 s4, s5, s4
 ; GFX10-NEXT:    s_lshr_b32 s5, s7, s6
 ; GFX10-NEXT:    s_or_b32 s2, s3, s2
-; GFX10-NEXT:    s_and_b32 s1, s1, s11
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX10-NEXT:    s_or_b32 s3, s4, s5
-; GFX10-NEXT:    s_and_b32 s0, s0, s11
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX10-NEXT:    s_and_b32 s2, s2, s11
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX10-NEXT:    s_and_b32 s2, s3, s11
+; GFX10-NEXT:    s_and_b32 s2, s3, 0xff
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
@@ -1136,51 +1126,50 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
-; GFX10-NEXT:    v_and_b32_e32 v11, 7, v2
+; GFX10-NEXT:    v_and_b32_e32 v10, 7, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v2
+; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_lshlrev_b16 v0, v11, v0
-; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v8
+; GFX10-NEXT:    v_lshlrev_b16 v0, v10, v0
+; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v8
 ; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v13, 0xff
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX10-NEXT:    v_and_b32_e32 v12, s4, v1
-; GFX10-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xff, v1
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX10-NEXT:    v_lshlrev_b16 v3, v8, v3
-; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v9
+; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v11
 ; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_xor_b32_e32 v13, -1, v2
-; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
+; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
 ; GFX10-NEXT:    v_lshrrev_b16 v6, 1, v6
-; GFX10-NEXT:    v_and_b32_e32 v9, 7, v9
+; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
 ; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 1, v1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX10-NEXT:    v_and_b32_e32 v13, 7, v13
 ; GFX10-NEXT:    v_lshrrev_b16 v7, 1, v7
-; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
+; GFX10-NEXT:    v_and_b32_e32 v9, 7, v9
 ; GFX10-NEXT:    v_lshrrev_b16 v12, 1, v12
-; GFX10-NEXT:    v_lshrrev_b16 v6, v11, v6
-; GFX10-NEXT:    v_lshlrev_b16 v4, v9, v4
+; GFX10-NEXT:    v_lshrrev_b16 v6, v10, v6
+; GFX10-NEXT:    v_lshlrev_b16 v4, v11, v4
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v8, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v2, v2, v5
 ; GFX10-NEXT:    v_lshrrev_b16 v5, v13, v7
-; GFX10-NEXT:    v_lshrrev_b16 v7, v10, v12
+; GFX10-NEXT:    v_lshrrev_b16 v7, v9, v12
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 8
 ; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v7
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
@@ -1199,8 +1188,8 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xffffff
 ; GFX6-NEXT:    s_mov_b32 s3, 0xffffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s3
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x170001
@@ -1230,8 +1219,8 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xffffff
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffffff
-; GFX8-NEXT:    s_and_b32 s2, s2, s3
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x170001
@@ -1261,8 +1250,8 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffffff
 ; GFX9-NEXT:    s_mov_b32 s3, 0xffffff
-; GFX9-NEXT:    s_and_b32 s2, s2, s3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x170001
@@ -1289,9 +1278,8 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX10-LABEL: s_fshl_i24:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
-; GFX10-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xffffff
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x170001
-; GFX10-NEXT:    s_and_b32 s2, s2, s3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -1308,8 +1296,8 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 23, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v1, v1, s1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v0, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -1433,11 +1421,10 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffffff
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 23, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, v2, v3
-; GFX10-NEXT:    v_and_b32_e32 v4, v4, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt)
@@ -1449,25 +1436,23 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_movk_i32 s9, 0xff
-; GFX6-NEXT:    s_mov_b32 s11, 0x80008
 ; GFX6-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s0, 24
-; GFX6-NEXT:    s_and_b32 s10, s0, s9
-; GFX6-NEXT:    s_bfe_u32 s0, s0, s11
+; GFX6-NEXT:    s_and_b32 s10, s0, 0xff
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x80008
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
-; GFX6-NEXT:    s_and_b32 s6, s6, s9
+; GFX6-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX6-NEXT:    s_or_b32 s0, s10, s0
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    s_lshr_b32 s8, s1, 8
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, s9
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_or_b32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX6-NEXT:    s_and_b32 s6, s8, s9
+; GFX6-NEXT:    s_and_b32 s6, s8, 0xff
 ; GFX6-NEXT:    s_or_b32 s1, s7, s1
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
@@ -1476,20 +1461,20 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_or_b32 s1, s1, s6
 ; GFX6-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s2, 24
-; GFX6-NEXT:    s_and_b32 s10, s2, s9
-; GFX6-NEXT:    s_bfe_u32 s2, s2, s11
+; GFX6-NEXT:    s_and_b32 s10, s2, 0xff
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x80008
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX6-NEXT:    s_and_b32 s6, s6, s9
+; GFX6-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX6-NEXT:    s_or_b32 s2, s10, s2
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s8, s3, 8
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_and_b32 s3, s3, s9
+; GFX6-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX6-NEXT:    s_or_b32 s2, s2, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX6-NEXT:    s_and_b32 s6, s8, s9
+; GFX6-NEXT:    s_and_b32 s6, s8, 0xff
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    s_or_b32 s3, s7, s3
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
@@ -1498,10 +1483,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_or_b32 s3, s3, s6
 ; GFX6-NEXT:    s_lshr_b32 s6, s4, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s4, 24
-; GFX6-NEXT:    s_and_b32 s10, s4, s9
-; GFX6-NEXT:    s_bfe_u32 s4, s4, s11
+; GFX6-NEXT:    s_and_b32 s10, s4, 0xff
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x80008
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 8
-; GFX6-NEXT:    s_and_b32 s6, s6, s9
+; GFX6-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, 24
 ; GFX6-NEXT:    s_or_b32 s4, s10, s4
@@ -1515,14 +1500,14 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    s_lshr_b32 s8, s5, 8
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX6-NEXT:    s_and_b32 s5, s5, s9
+; GFX6-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX6-NEXT:    s_and_b32 s6, s8, s9
+; GFX6-NEXT:    s_and_b32 s6, s8, 0xff
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    s_or_b32 s5, s7, s5
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
@@ -1555,6 +1540,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
 ; GFX6-NEXT:    s_lshr_b32 s0, s3, 1
 ; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT:    s_movk_i32 s9, 0xff
 ; GFX6-NEXT:    v_lshl_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_lshr_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_bfe_u32 v3, v0, 8, 8
@@ -1579,25 +1565,24 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-LABEL: s_fshl_v2i24:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s6, s0, 8
-; GFX8-NEXT:    s_movk_i32 s10, 0xff
-; GFX8-NEXT:    s_and_b32 s6, s6, s10
-; GFX8-NEXT:    s_bfe_u32 s11, 8, 0x100000
+; GFX8-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX8-NEXT:    s_bfe_u32 s10, 8, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s7, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 24
-; GFX8-NEXT:    s_and_b32 s0, s0, s10
-; GFX8-NEXT:    s_lshl_b32 s6, s6, s11
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX8-NEXT:    s_lshl_b32 s6, s6, s10
 ; GFX8-NEXT:    s_or_b32 s0, s0, s6
-; GFX8-NEXT:    s_and_b32 s6, s7, s10
+; GFX8-NEXT:    s_and_b32 s6, s7, 0xff
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, s10
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_or_b32 s0, s0, s6
-; GFX8-NEXT:    s_lshl_b32 s1, s1, s11
-; GFX8-NEXT:    s_and_b32 s6, s9, s10
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s10
+; GFX8-NEXT:    s_and_b32 s6, s9, 0xff
 ; GFX8-NEXT:    s_or_b32 s1, s8, s1
 ; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
@@ -1606,23 +1591,23 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_or_b32 s1, s1, s6
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 8
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_and_b32 s6, s6, s10
+; GFX8-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
 ; GFX8-NEXT:    s_lshr_b32 s8, s2, 24
-; GFX8-NEXT:    s_and_b32 s2, s2, s10
-; GFX8-NEXT:    s_lshl_b32 s6, s6, s11
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX8-NEXT:    s_lshl_b32 s6, s6, s10
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
-; GFX8-NEXT:    s_and_b32 s6, s7, s10
+; GFX8-NEXT:    s_and_b32 s6, s7, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 8
 ; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, s10
+; GFX8-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
-; GFX8-NEXT:    s_lshl_b32 s3, s3, s11
-; GFX8-NEXT:    s_and_b32 s6, s9, s10
+; GFX8-NEXT:    s_lshl_b32 s3, s3, s10
+; GFX8-NEXT:    s_and_b32 s6, s9, 0xff
 ; GFX8-NEXT:    s_or_b32 s3, s8, s3
 ; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
@@ -1630,13 +1615,13 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    s_or_b32 s3, s3, s6
 ; GFX8-NEXT:    s_lshr_b32 s6, s4, 8
-; GFX8-NEXT:    s_and_b32 s6, s6, s10
+; GFX8-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX8-NEXT:    s_lshr_b32 s7, s4, 16
 ; GFX8-NEXT:    s_lshr_b32 s8, s4, 24
-; GFX8-NEXT:    s_and_b32 s4, s4, s10
-; GFX8-NEXT:    s_lshl_b32 s6, s6, s11
+; GFX8-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX8-NEXT:    s_lshl_b32 s6, s6, s10
 ; GFX8-NEXT:    s_or_b32 s4, s4, s6
-; GFX8-NEXT:    s_and_b32 s6, s7, s10
+; GFX8-NEXT:    s_and_b32 s6, s7, 0xff
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v2, 24
 ; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
@@ -1649,14 +1634,14 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX8-NEXT:    s_lshr_b32 s9, s5, 8
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX8-NEXT:    s_and_b32 s5, s5, s10
+; GFX8-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v2
-; GFX8-NEXT:    s_lshl_b32 s5, s5, s11
+; GFX8-NEXT:    s_lshl_b32 s5, s5, s10
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v0
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX8-NEXT:    s_and_b32 s6, s9, s10
+; GFX8-NEXT:    s_and_b32 s6, s9, 0xff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    s_or_b32 s5, s8, s5
 ; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
@@ -1698,7 +1683,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX8-NEXT:    v_and_b32_e32 v3, s10, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
@@ -1712,27 +1697,26 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_lshr_b32 s7, s0, 8
-; GFX9-NEXT:    s_movk_i32 s12, 0xff
-; GFX9-NEXT:    s_and_b32 s7, s7, s12
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX9-NEXT:    s_bfe_u32 s12, 8, 0x100000
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX9-NEXT:    s_bfe_u32 s13, 8, 0x100000
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s10, s0, 24
-; GFX9-NEXT:    s_and_b32 s0, s0, s12
-; GFX9-NEXT:    s_lshl_b32 s7, s7, s13
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX9-NEXT:    s_lshl_b32 s7, s7, s12
 ; GFX9-NEXT:    s_or_b32 s0, s0, s7
-; GFX9-NEXT:    s_and_b32 s7, s9, s12
+; GFX9-NEXT:    s_and_b32 s7, s9, 0xff
 ; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX9-NEXT:    s_lshr_b32 s11, s1, 8
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_and_b32 s1, s1, s12
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX9-NEXT:    s_or_b32 s0, s0, s7
-; GFX9-NEXT:    s_lshl_b32 s1, s1, s13
-; GFX9-NEXT:    s_and_b32 s7, s11, s12
+; GFX9-NEXT:    s_lshl_b32 s1, s1, s12
+; GFX9-NEXT:    s_and_b32 s7, s11, 0xff
 ; GFX9-NEXT:    s_or_b32 s1, s10, s1
 ; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
@@ -1740,24 +1724,24 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_or_b32 s1, s1, s7
 ; GFX9-NEXT:    s_lshr_b32 s7, s2, 8
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    s_and_b32 s7, s7, s12
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s9, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s10, s2, 24
-; GFX9-NEXT:    s_and_b32 s2, s2, s12
-; GFX9-NEXT:    s_lshl_b32 s7, s7, s13
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX9-NEXT:    s_lshl_b32 s7, s7, s12
 ; GFX9-NEXT:    s_or_b32 s2, s2, s7
-; GFX9-NEXT:    s_and_b32 s7, s9, s12
+; GFX9-NEXT:    s_and_b32 s7, s9, 0xff
 ; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, 24
 ; GFX9-NEXT:    s_lshr_b32 s11, s3, 8
 ; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_and_b32 s3, s3, s12
+; GFX9-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX9-NEXT:    s_or_b32 s2, s2, s7
-; GFX9-NEXT:    s_lshl_b32 s3, s3, s13
-; GFX9-NEXT:    s_and_b32 s7, s11, s12
+; GFX9-NEXT:    s_lshl_b32 s3, s3, s12
+; GFX9-NEXT:    s_and_b32 s7, s11, 0xff
 ; GFX9-NEXT:    s_or_b32 s3, s10, s3
 ; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x100000
@@ -1765,14 +1749,14 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_or_b32 s3, s3, s7
 ; GFX9-NEXT:    s_lshr_b32 s7, s4, 8
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX9-NEXT:    s_and_b32 s7, s7, s12
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    s_lshr_b32 s9, s4, 16
 ; GFX9-NEXT:    s_lshr_b32 s10, s4, 24
-; GFX9-NEXT:    s_and_b32 s4, s4, s12
-; GFX9-NEXT:    s_lshl_b32 s7, s7, s13
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX9-NEXT:    s_lshl_b32 s7, s7, s12
 ; GFX9-NEXT:    s_or_b32 s4, s4, s7
-; GFX9-NEXT:    s_and_b32 s7, s9, s12
+; GFX9-NEXT:    s_and_b32 s7, s9, 0xff
 ; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
@@ -1780,10 +1764,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_or_b32 s4, s4, s7
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    s_lshr_b32 s11, s5, 8
-; GFX9-NEXT:    s_and_b32 s5, s5, s12
+; GFX9-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX9-NEXT:    s_lshl_b32 s5, s5, s13
-; GFX9-NEXT:    s_and_b32 s7, s11, s12
+; GFX9-NEXT:    s_lshl_b32 s5, s5, s12
+; GFX9-NEXT:    s_and_b32 s7, s11, 0xff
 ; GFX9-NEXT:    s_or_b32 s5, s10, s5
 ; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 24
@@ -1822,10 +1806,11 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v2, v2, s0
 ; GFX9-NEXT:    s_mov_b32 s6, 8
 ; GFX9-NEXT:    v_lshl_or_b32 v1, s1, v1, v2
+; GFX9-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NEXT:    s_mov_b32 s8, 16
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_b32_e32 v3, s12, v1
-; GFX9-NEXT:    v_and_or_b32 v2, v0, s12, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, s0, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v0, s0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX9-NEXT:    v_or3_b32 v0, v2, v0, v3
@@ -1840,120 +1825,117 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, 24
-; GFX10-NEXT:    s_movk_i32 s9, 0xff
-; GFX10-NEXT:    s_lshr_b32 s10, s1, 8
-; GFX10-NEXT:    s_bfe_u32 s11, 8, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX10-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX10-NEXT:    s_and_b32 s1, s1, s9
-; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX10-NEXT:    s_bfe_u32 s10, 8, 0x100000
 ; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
-; GFX10-NEXT:    s_lshl_b32 s1, s1, s11
-; GFX10-NEXT:    s_and_b32 s6, s6, s9
-; GFX10-NEXT:    s_or_b32 s1, s8, s1
-; GFX10-NEXT:    s_lshr_b32 s8, s4, 8
-; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX10-NEXT:    s_lshl_b32 s6, s6, s10
+; GFX10-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX10-NEXT:    s_or_b32 s0, s0, s6
+; GFX10-NEXT:    s_bfe_u32 s6, s7, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s7, s4, 8
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX10-NEXT:    s_and_b32 s0, s0, s9
-; GFX10-NEXT:    s_lshl_b32 s6, s6, s11
-; GFX10-NEXT:    s_and_b32 s8, s8, s9
+; GFX10-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX10-NEXT:    s_lshr_b32 s11, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s12, s4, 24
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    s_or_b32 s0, s0, s6
-; GFX10-NEXT:    s_and_b32 s6, s7, s9
-; GFX10-NEXT:    s_and_b32 s7, s10, s9
+; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX10-NEXT:    s_lshl_b32 s7, s7, s10
+; GFX10-NEXT:    s_lshr_b32 s13, s5, 8
 ; GFX10-NEXT:    v_mul_lo_u32 v2, 0xffffffe8, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, 0xffffffe8, v1
-; GFX10-NEXT:    s_lshr_b32 s10, s4, 16
-; GFX10-NEXT:    s_lshr_b32 s12, s4, 24
-; GFX10-NEXT:    s_and_b32 s4, s4, s9
-; GFX10-NEXT:    s_lshl_b32 s8, s8, s11
-; GFX10-NEXT:    s_lshr_b32 s13, s5, 8
-; GFX10-NEXT:    s_or_b32 s4, s4, s8
+; GFX10-NEXT:    s_or_b32 s4, s4, s7
+; GFX10-NEXT:    s_and_b32 s7, s11, 0xff
+; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX10-NEXT:    s_and_b32 s5, s5, 0xff
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX10-NEXT:    s_and_b32 s8, s10, s9
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX10-NEXT:    s_bfe_u32 s8, s8, 0x100000
-; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX10-NEXT:    s_and_b32 s5, s5, s9
-; GFX10-NEXT:    s_or_b32 s4, s4, s8
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
-; GFX10-NEXT:    s_lshl_b32 s5, s5, s11
-; GFX10-NEXT:    s_and_b32 s8, s13, s9
+; GFX10-NEXT:    s_lshl_b32 s5, s5, s10
+; GFX10-NEXT:    s_or_b32 s4, s4, s7
+; GFX10-NEXT:    s_and_b32 s7, s13, 0xff
 ; GFX10-NEXT:    s_or_b32 s5, s12, s5
-; GFX10-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX10-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX10-NEXT:    s_or_b32 s5, s5, s7
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX10-NEXT:    s_bfe_u32 s5, s5, 0x100000
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX10-NEXT:    s_or_b32 s5, s5, s8
-; GFX10-NEXT:    s_lshr_b32 s8, s2, 8
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX10-NEXT:    s_and_b32 s8, s8, s9
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10-NEXT:    s_and_b32 s7, s9, 0xff
+; GFX10-NEXT:    s_lshl_b32 s1, s1, s10
+; GFX10-NEXT:    s_lshr_b32 s9, s2, 16
+; GFX10-NEXT:    s_or_b32 s1, s8, s1
+; GFX10-NEXT:    s_lshr_b32 s8, s2, 8
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX10-NEXT:    s_and_b32 s12, s2, s9
-; GFX10-NEXT:    s_lshl_b32 s8, s8, s11
-; GFX10-NEXT:    s_and_b32 s10, s10, s9
-; GFX10-NEXT:    s_or_b32 s8, s12, s8
-; GFX10-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, 24
-; GFX10-NEXT:    s_bfe_u32 s6, s6, 0x100000
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s4, v0
-; GFX10-NEXT:    s_bfe_u32 s4, s8, 0x100000
-; GFX10-NEXT:    s_bfe_u32 s8, s10, 0x100000
+; GFX10-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX10-NEXT:    s_lshr_b32 s11, s2, 24
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX10-NEXT:    s_lshl_b32 s8, s8, s10
 ; GFX10-NEXT:    s_bfe_u32 s7, s7, 0x100000
-; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x100000
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
+; GFX10-NEXT:    s_or_b32 s2, s2, s8
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s4, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s5, v1
+; GFX10-NEXT:    s_lshr_b32 s4, s3, 8
+; GFX10-NEXT:    s_and_b32 s5, s9, 0xff
+; GFX10-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT:    s_lshl_b32 s5, s8, 16
-; GFX10-NEXT:    s_lshr_b32 s8, s3, 8
-; GFX10-NEXT:    s_and_b32 s3, s3, s9
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v1
+; GFX10-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s10
+; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX10-NEXT:    s_lshl_b32 s3, s3, s11
-; GFX10-NEXT:    s_or_b32 s4, s4, s5
-; GFX10-NEXT:    s_or_b32 s2, s2, s3
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX10-NEXT:    s_or_b32 s3, s11, s3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT:    s_and_b32 s3, s8, s9
-; GFX10-NEXT:    s_mov_b32 s5, 0xffffff
+; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX10-NEXT:    s_or_b32 s2, s2, s5
 ; GFX10-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
-; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX10-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX10-NEXT:    s_or_b32 s3, s3, s4
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 23, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffffff
-; GFX10-NEXT:    s_or_b32 s2, s2, s3
-; GFX10-NEXT:    s_lshr_b32 s3, s4, 1
-; GFX10-NEXT:    v_and_b32_e32 v2, s5, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 23, v1
-; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX10-NEXT:    v_and_b32_e32 v0, s5, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX10-NEXT:    v_lshrrev_b32_e64 v2, v2, s3
-; GFX10-NEXT:    v_and_b32_e32 v4, v4, v3
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX10-NEXT:    v_lshrrev_b32_e64 v2, v2, s2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
+; GFX10-NEXT:    s_lshr_b32 s2, s3, 1
 ; GFX10-NEXT:    s_or_b32 s0, s0, s6
 ; GFX10-NEXT:    s_or_b32 s1, s1, s7
-; GFX10-NEXT:    v_lshrrev_b32_e64 v3, v4, s2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v0, v2
+; GFX10-NEXT:    v_lshrrev_b32_e64 v3, v3, s2
 ; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    v_lshl_or_b32 v1, s1, v1, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshl_or_b32 v1, s1, v1, v3
 ; GFX10-NEXT:    s_mov_b32 s0, 16
-; GFX10-NEXT:    v_and_b32_e32 v3, s9, v1
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s9, v2
+; GFX10-NEXT:    v_and_or_b32 v2, v0, 0xff, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_bfe_u32 v4, v1, 8, 8
 ; GFX10-NEXT:    v_bfe_u32 v1, v1, 16, 8
@@ -2141,12 +2123,11 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v7, 24
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0xffffff
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX10-NEXT:    v_bfe_u32 v2, v2, 1, 23
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; GFX10-NEXT:    v_and_b32_e32 v5, v5, v10
 ; GFX10-NEXT:    v_bfe_u32 v3, v3, 1, 23
 ; GFX10-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
@@ -2177,12 +2158,12 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v4, v4, v10
-; GFX10-NEXT:    v_and_b32_e32 v6, v6, v10
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
-; GFX10-NEXT:    v_and_b32_e32 v5, v5, v10
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
-; GFX10-NEXT:    v_and_b32_e32 v7, v7, v10
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, v5, v3
@@ -3176,19 +3157,18 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX6-LABEL: s_fshl_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_and_b32 s6, s4, 15
-; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX6-NEXT:    s_andn2_b32 s4, 15, s4
-; GFX6-NEXT:    s_lshl_b32 s0, s0, s6
-; GFX6-NEXT:    s_mov_b32 s6, 0xf0001
-; GFX6-NEXT:    s_bfe_u32 s2, s2, s6
+; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s6
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s5, 15
 ; GFX6-NEXT:    s_andn2_b32 s4, 15, s5
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s2
-; GFX6-NEXT:    s_bfe_u32 s2, s3, s6
+; GFX6-NEXT:    s_bfe_u32 s2, s3, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s3, s4, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s3
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
@@ -3229,22 +3209,20 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ;
 ; GFX9-LABEL: s_fshl_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000f
-; GFX9-NEXT:    s_and_b32 s4, s2, s3
-; GFX9-NEXT:    s_andn2_b32 s2, s3, s2
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX9-NEXT:    s_lshl_b32 s3, s3, s5
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_and_b32 s3, s2, 0xf000f
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX9-NEXT:    s_lshl_b32 s3, s4, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX9-NEXT:    s_and_b32 s1, s1, s4
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 0x10001
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX9-NEXT:    s_andn2_b32 s2, 0xf000f, s2
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX9-NEXT:    s_and_b32 s1, s1, s4
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX9-NEXT:    s_lshr_b32 s2, s3, s4
@@ -3254,22 +3232,20 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ;
 ; GFX10-LABEL: s_fshl_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s5, 0xffff
-; GFX10-NEXT:    s_mov_b32 s3, 0xf000f
-; GFX10-NEXT:    s_and_b32 s7, s1, s5
+; GFX10-NEXT:    s_and_b32 s6, s1, 0xffff
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX10-NEXT:    s_lshr_b32 s7, s7, 0x10001
+; GFX10-NEXT:    s_and_b32 s3, s2, 0xf000f
+; GFX10-NEXT:    s_lshr_b32 s6, s6, 0x10001
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX10-NEXT:    s_and_b32 s4, s2, s3
-; GFX10-NEXT:    s_andn2_b32 s2, s3, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s7, s1
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX10-NEXT:    s_andn2_b32 s2, 0xf000f, s2
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s6, s1
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshl_b32 s3, s4, s5
 ; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX10-NEXT:    s_and_b32 s1, s1, s5
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX10-NEXT:    s_lshl_b32 s3, s3, s6
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX10-NEXT:    s_lshr_b32 s2, s4, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
@@ -3347,10 +3323,9 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0xf000f
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 1, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xf000f, v3
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3420,19 +3395,18 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
-; GFX6-NEXT:    s_mov_b32 s0, 0xf0001
-; GFX6-NEXT:    s_bfe_u32 s2, s2, s0
+; GFX6-NEXT:    s_bfe_u32 s0, s2, 0xf0001
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT:    v_lshr_b32_e32 v0, s2, v0
+; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
-; GFX6-NEXT:    s_bfe_u32 s0, s3, s0
+; GFX6-NEXT:    s_bfe_u32 s0, s3, 0xf0001
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s1, v2
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
@@ -3488,13 +3462,12 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX10-LABEL: v_fshl_v2i16_ssv:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v0
-; GFX10-NEXT:    s_mov_b32 s2, 0xf000f
-; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX10-NEXT:    s_lshr_b32 s2, s1, 16
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v0, s2, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, s2, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xf000f, v0
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, 0x10001
-; GFX10-NEXT:    s_lshr_b32 s2, s3, 1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xf000f, v1
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, s0
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v1, s1
@@ -3556,13 +3529,12 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 ;
 ; GFX9-LABEL: v_fshl_v2i16_svs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s2, 0xf000f
-; GFX9-NEXT:    s_and_b32 s3, s1, s2
-; GFX9-NEXT:    s_andn2_b32 s1, s2, s1
-; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
-; GFX9-NEXT:    s_lshl_b32 s2, s2, s4
+; GFX9-NEXT:    s_and_b32 s2, s1, 0xf000f
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX9-NEXT:    s_andn2_b32 s1, 0xf000f, s1
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX9-NEXT:    s_lshl_b32 s2, s3, s4
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, s1, v0
@@ -3571,15 +3543,14 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 ;
 ; GFX10-LABEL: v_fshl_v2i16_svs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s2, 0xf000f
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    s_and_b32 s3, s1, s2
-; GFX10-NEXT:    s_andn2_b32 s1, s2, s1
-; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX10-NEXT:    s_and_b32 s2, s1, 0xf000f
+; GFX10-NEXT:    s_andn2_b32 s1, 0xf000f, s1
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, s1, v0
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
-; GFX10-NEXT:    s_lshl_b32 s1, s2, s4
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshl_b32 s1, s3, s4
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -3592,19 +3563,18 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ; GFX6-LABEL: v_fshl_v2i16_vss:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_and_b32 s4, s2, 15
-; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
-; GFX6-NEXT:    s_mov_b32 s4, 0xf0001
-; GFX6-NEXT:    s_bfe_u32 s0, s0, s4
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, s2
 ; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s3, 15
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
-; GFX6-NEXT:    s_bfe_u32 s0, s1, s4
+; GFX6-NEXT:    s_bfe_u32 s0, s1, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s1, s2, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX6-NEXT:    v_or_b32_e32 v1, s0, v1
@@ -3642,18 +3612,16 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ;
 ; GFX9-LABEL: v_fshl_v2i16_vss:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s2, 0xf000f
-; GFX9-NEXT:    s_and_b32 s3, s1, s2
-; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s3, v0
-; GFX9-NEXT:    s_mov_b32 s3, 0xffff
-; GFX9-NEXT:    s_andn2_b32 s1, s2, s1
+; GFX9-NEXT:    s_and_b32 s2, s1, 0xf000f
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s2, v0
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX9-NEXT:    s_and_b32 s0, s0, s3
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s0, s0, 0x10001
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX9-NEXT:    s_andn2_b32 s1, 0xf000f, s1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX9-NEXT:    s_and_b32 s0, s0, s3
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX9-NEXT:    s_lshr_b32 s1, s2, s3
@@ -3663,18 +3631,16 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ;
 ; GFX10-LABEL: v_fshl_v2i16_vss:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s3, 0xffff
-; GFX10-NEXT:    s_mov_b32 s2, 0xf000f
-; GFX10-NEXT:    s_and_b32 s5, s0, s3
+; GFX10-NEXT:    s_and_b32 s3, s0, 0xffff
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s5, s5, 0x10001
+; GFX10-NEXT:    s_lshr_b32 s3, s3, 0x10001
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 1
-; GFX10-NEXT:    s_and_b32 s4, s1, s2
-; GFX10-NEXT:    s_andn2_b32 s1, s2, s1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s5, s0
-; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s4, v0
+; GFX10-NEXT:    s_and_b32 s2, s1, 0xf000f
+; GFX10-NEXT:    s_andn2_b32 s1, 0xf000f, s1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s3, s0
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s2, v0
 ; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX10-NEXT:    s_and_b32 s0, s0, s3
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX10-NEXT:    s_lshr_b32 s1, s2, s3
@@ -3704,19 +3670,18 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-LABEL: s_fshl_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_and_b32 s12, s8, 15
-; GFX6-NEXT:    s_bfe_u32 s12, s12, 0x100000
 ; GFX6-NEXT:    s_andn2_b32 s8, 15, s8
-; GFX6-NEXT:    s_lshl_b32 s0, s0, s12
-; GFX6-NEXT:    s_mov_b32 s12, 0xf0001
-; GFX6-NEXT:    s_bfe_u32 s4, s4, s12
+; GFX6-NEXT:    s_bfe_u32 s12, s12, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s12
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s4
 ; GFX6-NEXT:    s_and_b32 s4, s9, 15
 ; GFX6-NEXT:    s_andn2_b32 s8, 15, s9
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX6-NEXT:    s_bfe_u32 s4, s5, s12
+; GFX6-NEXT:    s_bfe_u32 s4, s5, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s5, s8, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX6-NEXT:    s_or_b32 s1, s1, s4
@@ -3724,7 +3689,7 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT:    s_andn2_b32 s5, 15, s10
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, s4
-; GFX6-NEXT:    s_bfe_u32 s4, s6, s12
+; GFX6-NEXT:    s_bfe_u32 s4, s6, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX6-NEXT:    s_or_b32 s2, s2, s4
@@ -3732,7 +3697,7 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT:    s_andn2_b32 s5, 15, s11
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, s4
-; GFX6-NEXT:    s_bfe_u32 s4, s7, s12
+; GFX6-NEXT:    s_bfe_u32 s4, s7, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
@@ -3801,42 +3766,39 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ;
 ; GFX9-LABEL: s_fshl_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s6, 0xf000f
-; GFX9-NEXT:    s_and_b32 s7, s4, s6
-; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s7, 16
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s7
-; GFX9-NEXT:    s_lshl_b32 s7, s9, s10
-; GFX9-NEXT:    s_mov_b32 s9, 0xffff
-; GFX9-NEXT:    s_mov_b32 s8, 0x10001
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s7
-; GFX9-NEXT:    s_lshr_b32 s7, s2, 16
-; GFX9-NEXT:    s_and_b32 s2, s2, s9
-; GFX9-NEXT:    s_lshr_b32 s2, s2, s8
-; GFX9-NEXT:    s_lshr_b32 s7, s7, 1
-; GFX9-NEXT:    s_andn2_b32 s4, s6, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s7
-; GFX9-NEXT:    s_lshr_b32 s7, s2, 16
-; GFX9-NEXT:    s_and_b32 s2, s2, s9
-; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX9-NEXT:    s_and_b32 s6, s4, 0xf000f
+; GFX9-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s6
+; GFX9-NEXT:    s_lshl_b32 s6, s7, s8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 0x10001
+; GFX9-NEXT:    s_lshr_b32 s6, s6, 1
+; GFX9-NEXT:    s_andn2_b32 s4, 0xf000f, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT:    s_lshr_b32 s7, s4, 16
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, s4
-; GFX9-NEXT:    s_lshr_b32 s4, s7, s10
+; GFX9-NEXT:    s_lshr_b32 s4, s6, s7
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
 ; GFX9-NEXT:    s_or_b32 s0, s0, s2
-; GFX9-NEXT:    s_and_b32 s2, s5, s6
-; GFX9-NEXT:    s_andn2_b32 s4, s6, s5
+; GFX9-NEXT:    s_and_b32 s2, s5, 0xf000f
+; GFX9-NEXT:    s_andn2_b32 s4, 0xf000f, s5
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX9-NEXT:    s_lshl_b32 s2, s5, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
 ; GFX9-NEXT:    s_lshr_b32 s2, s3, 16
-; GFX9-NEXT:    s_and_b32 s3, s3, s9
-; GFX9-NEXT:    s_lshr_b32 s3, s3, s8
+; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 0x10001
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
 ; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX9-NEXT:    s_and_b32 s2, s2, s9
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, s5
@@ -3846,40 +3808,37 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ;
 ; GFX10-LABEL: s_fshl_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s10, 0xffff
-; GFX10-NEXT:    s_mov_b32 s6, 0xf000f
-; GFX10-NEXT:    s_mov_b32 s8, 0x10001
-; GFX10-NEXT:    s_and_b32 s12, s2, s10
+; GFX10-NEXT:    s_and_b32 s9, s2, 0xffff
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX10-NEXT:    s_and_b32 s7, s4, s6
-; GFX10-NEXT:    s_lshr_b32 s12, s12, s8
+; GFX10-NEXT:    s_and_b32 s6, s4, 0xf000f
+; GFX10-NEXT:    s_lshr_b32 s9, s9, 0x10001
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX10-NEXT:    s_andn2_b32 s4, s6, s4
-; GFX10-NEXT:    s_lshr_b32 s9, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s11, s7, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s12, s2
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s7
-; GFX10-NEXT:    s_lshl_b32 s7, s9, s11
-; GFX10-NEXT:    s_lshr_b32 s9, s2, 16
-; GFX10-NEXT:    s_and_b32 s2, s2, s10
-; GFX10-NEXT:    s_lshr_b32 s11, s4, 16
+; GFX10-NEXT:    s_andn2_b32 s4, 0xf000f, s4
+; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s8, s6, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s9, s2
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s6
+; GFX10-NEXT:    s_lshl_b32 s6, s7, s8
+; GFX10-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX10-NEXT:    s_lshr_b32 s8, s4, 16
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, s4
-; GFX10-NEXT:    s_lshr_b32 s4, s9, s11
-; GFX10-NEXT:    s_and_b32 s9, s3, s10
+; GFX10-NEXT:    s_lshr_b32 s4, s7, s8
+; GFX10-NEXT:    s_and_b32 s8, s3, 0xffff
 ; GFX10-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
-; GFX10-NEXT:    s_and_b32 s4, s5, s6
-; GFX10-NEXT:    s_lshr_b32 s8, s9, s8
+; GFX10-NEXT:    s_and_b32 s4, s5, 0xf000f
+; GFX10-NEXT:    s_lshr_b32 s8, s8, 0x10001
 ; GFX10-NEXT:    s_lshr_b32 s3, s3, 1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s7
-; GFX10-NEXT:    s_andn2_b32 s5, s6, s5
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
+; GFX10-NEXT:    s_andn2_b32 s5, 0xf000f, s5
 ; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s7, s4, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s8, s3
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX10-NEXT:    s_lshl_b32 s4, s6, s7
 ; GFX10-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX10-NEXT:    s_and_b32 s3, s3, s10
+; GFX10-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX10-NEXT:    s_lshr_b32 s7, s5, 16
 ; GFX10-NEXT:    s_lshr_b32 s3, s3, s5
 ; GFX10-NEXT:    s_lshr_b32 s5, s6, s7
@@ -4003,16 +3962,15 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v4
 ; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v5
-; GFX10-NEXT:    s_mov_b32 s4, 0xf000f
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX10-NEXT:    v_and_b32_e32 v6, s4, v6
-; GFX10-NEXT:    v_and_b32_e32 v5, s4, v5
-; GFX10-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xf000f, v5
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xf000f, v6
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xf000f, v7
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v4, v0
-; GFX10-NEXT:    v_pk_lshrrev_b16 v2, v6, v2
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v5, v1
+; GFX10-NEXT:    v_pk_lshrrev_b16 v2, v6, v2
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v3, v7, v3
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -4893,14 +4851,13 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_movk_i32 s4, 0x7f
-; GFX10-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
-; GFX10-NEXT:    v_and_b32_e32 v18, s4, v8
+; GFX10-NEXT:    v_and_b32_e32 v18, 0x7f, v8
 ; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 31, v6
 ; GFX10-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v9, 64, v18
-; GFX10-NEXT:    v_and_b32_e32 v19, s4, v8
+; GFX10-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v18, v[2:3]
 ; GFX10-NEXT:    v_or_b32_e32 v5, v5, v12
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
@@ -5104,44 +5061,43 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ;
 ; GFX10-LABEL: v_fshl_i128_ssv:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_movk_i32 s9, 0x7f
-; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v0
-; GFX10-NEXT:    v_and_b32_e32 v12, s9, v0
+; GFX10-NEXT:    v_and_b32_e32 v12, 0x7f, v0
+; GFX10-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX10-NEXT:    s_mov_b32 s8, 0
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], 1
-; GFX10-NEXT:    v_and_b32_e32 v13, s9, v4
+; GFX10-NEXT:    s_lshl_b32 s9, s6, 31
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v12
+; GFX10-NEXT:    v_and_b32_e32 v13, 0x7f, v0
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v12, s[2:3]
-; GFX10-NEXT:    s_lshl_b32 s9, s6, 31
-; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], 1
 ; GFX10-NEXT:    s_or_b64 s[8:9], s[4:5], s[8:9]
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], 1
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v14, 64, v13
 ; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v13, s[8:9]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v12, s[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 64, v13
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, 64, v13
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, s[6:7]
 ; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v10, s[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v1
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 64, v13
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v0, s[6:7]
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
-; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v0, s[6:7]
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v14, s[6:7]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX10-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v11, v3, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v13, s[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s8, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, s2, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, s3, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s8, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s9, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
@@ -6343,14 +6299,13 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_movk_i32 s7, 0x7f
-; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[8:9]
-; GFX10-NEXT:    v_and_b32_e32 v27, s7, v16
+; GFX10-NEXT:    v_and_b32_e32 v27, 0x7f, v16
 ; GFX10-NEXT:    v_xor_b32_e32 v16, -1, v16
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[8:9]
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 31, v10
 ; GFX10-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v17, 64, v27
-; GFX10-NEXT:    v_and_b32_e32 v28, s7, v16
+; GFX10-NEXT:    v_and_b32_e32 v28, 0x7f, v16
 ; GFX10-NEXT:    v_lshlrev_b64 v[18:19], v27, v[2:3]
 ; GFX10-NEXT:    v_or_b32_e32 v9, v9, v21
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v29, 64, v27
@@ -6382,7 +6337,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v10, v9, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, v0, s4
-; GFX10-NEXT:    v_and_b32_e32 v23, s7, v20
+; GFX10-NEXT:    v_and_b32_e32 v23, 0x7f, v20
 ; GFX10-NEXT:    v_or_b32_e32 v0, v21, v3
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v20
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, v1, s4
@@ -6391,7 +6346,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v9
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[12:13]
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 31, v14
-; GFX10-NEXT:    v_and_b32_e32 v25, s7, v3
+; GFX10-NEXT:    v_and_b32_e32 v25, 0x7f, v3
 ; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, v[4:5]
 ; GFX10-NEXT:    v_lshlrev_b64 v[12:13], v23, v[6:7]
 ; GFX10-NEXT:    v_lshrrev_b64 v[14:15], 1, v[14:15]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 9e76467fd55e2..52f78974d9aa6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -9,12 +9,12 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    s_and_b32 s2, s2, 0x7f
 ; GFX6-NEXT:    s_movk_i32 s3, 0x7f
-; GFX6-NEXT:    s_and_b32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_and_b32 s1, s1, s3
+; GFX6-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX6-NEXT:    v_mul_lo_u32 v1, -7, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -40,12 +40,12 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT:    s_and_b32 s2, s2, 0x7f
 ; GFX8-NEXT:    s_movk_i32 s3, 0x7f
-; GFX8-NEXT:    s_and_b32 s2, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_and_b32 s1, s1, s3
+; GFX8-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX8-NEXT:    v_mul_lo_u32 v1, -7, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -71,12 +71,12 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_and_b32 s2, s2, 0x7f
 ; GFX9-NEXT:    s_movk_i32 s3, 0x7f
-; GFX9-NEXT:    s_and_b32 s2, s2, s3
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_and_b32 s1, s1, s3
+; GFX9-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX9-NEXT:    v_mul_lo_u32 v1, -7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
@@ -101,10 +101,9 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX10-LABEL: s_fshr_i7:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
-; GFX10-NEXT:    s_movk_i32 s3, 0x7f
+; GFX10-NEXT:    s_and_b32 s2, s2, 0x7f
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX10-NEXT:    s_and_b32 s2, s2, s3
-; GFX10-NEXT:    s_and_b32 s1, s1, s3
+; GFX10-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -121,8 +120,8 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u16 v1, 6, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX10-NEXT:    v_lshrrev_b16 v0, v0, s1
 ; GFX10-NEXT:    v_lshlrev_b16 v1, v1, s0
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -249,12 +248,11 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 7, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7f
-; GFX10-NEXT:    v_sub_nc_u16 v4, 6, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, v2, v3
-; GFX10-NEXT:    v_and_b32_e32 v4, v4, v3
+; GFX10-NEXT:    v_sub_nc_u16 v3, 6, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0x7f, v3
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
-; GFX10-NEXT:    v_lshlrev_b16 v0, v4, v0
+; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt)
@@ -530,10 +528,9 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX6-NEXT:    s_lshr_b32 s4, s2, 8
 ; GFX6-NEXT:    s_and_b32 s5, s2, 7
 ; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
-; GFX6-NEXT:    s_movk_i32 s6, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX6-NEXT:    s_and_b32 s2, s1, s6
+; GFX6-NEXT:    s_and_b32 s2, s1, 0xff
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s5
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, 7
@@ -543,8 +540,8 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, s4
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX6-NEXT:    s_or_b32 s1, s3, s1
-; GFX6-NEXT:    s_and_b32 s1, s1, s6
-; GFX6-NEXT:    s_and_b32 s0, s0, s6
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -552,28 +549,27 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX8-LABEL: s_fshr_v2i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 8
 ; GFX8-NEXT:    s_lshr_b32 s5, s2, 8
 ; GFX8-NEXT:    s_and_b32 s6, s2, 7
 ; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX8-NEXT:    s_movk_i32 s2, 0xff
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 8
-; GFX8-NEXT:    s_and_b32 s1, s1, s2
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_andn2_b32 s2, 7, s5
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s6
-; GFX8-NEXT:    s_and_b32 s4, s4, s2
+; GFX8-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX8-NEXT:    s_and_b32 s3, s4, 0xff
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s5, 7
-; GFX8-NEXT:    s_andn2_b32 s5, 7, s5
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
-; GFX8-NEXT:    s_lshl_b32 s3, s3, s5
-; GFX8-NEXT:    s_lshr_b32 s1, s4, s1
-; GFX8-NEXT:    s_or_b32 s1, s3, s1
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s1, s1, s2
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s1, s3, s1
+; GFX8-NEXT:    s_or_b32 s1, s2, s1
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_bfe_u32 s2, 8, 0x100000
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -581,28 +577,27 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX9-LABEL: s_fshr_v2i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 8
 ; GFX9-NEXT:    s_lshr_b32 s5, s2, 8
 ; GFX9-NEXT:    s_and_b32 s6, s2, 7
 ; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX9-NEXT:    s_movk_i32 s2, 0xff
-; GFX9-NEXT:    s_lshr_b32 s4, s1, 8
-; GFX9-NEXT:    s_and_b32 s1, s1, s2
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_andn2_b32 s2, 7, s5
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s6
-; GFX9-NEXT:    s_and_b32 s4, s4, s2
+; GFX9-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX9-NEXT:    s_and_b32 s3, s4, 0xff
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s5, 7
-; GFX9-NEXT:    s_andn2_b32 s5, 7, s5
-; GFX9-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
-; GFX9-NEXT:    s_lshl_b32 s3, s3, s5
-; GFX9-NEXT:    s_lshr_b32 s1, s4, s1
-; GFX9-NEXT:    s_or_b32 s1, s3, s1
-; GFX9-NEXT:    s_and_b32 s0, s0, s2
-; GFX9-NEXT:    s_and_b32 s1, s1, s2
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s1, s3, s1
+; GFX9-NEXT:    s_or_b32 s1, s2, s1
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_bfe_u32 s2, 8, 0x100000
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -610,14 +605,13 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX10-LABEL: s_fshr_v2i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_lshr_b32 s4, s1, 8
-; GFX10-NEXT:    s_movk_i32 s7, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX10-NEXT:    s_lshr_b32 s5, s2, 8
 ; GFX10-NEXT:    s_and_b32 s6, s2, 7
 ; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX10-NEXT:    s_and_b32 s4, s4, s7
-; GFX10-NEXT:    s_and_b32 s1, s1, s7
+; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT:    s_and_b32 s2, s5, 7
 ; GFX10-NEXT:    s_andn2_b32 s5, 7, s5
@@ -629,9 +623,9 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, s6
 ; GFX10-NEXT:    s_or_b32 s2, s3, s2
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
-; GFX10-NEXT:    s_and_b32 s1, s2, s7
+; GFX10-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX10-NEXT:    s_bfe_u32 s2, 8, 0x100000
-; GFX10-NEXT:    s_and_b32 s0, s0, s7
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -729,18 +723,18 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
 ; GFX10-NEXT:    v_and_b32_e32 v7, 7, v2
-; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v3
 ; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v3
 ; GFX10-NEXT:    v_and_b32_e32 v3, 7, v3
 ; GFX10-NEXT:    v_lshlrev_b16 v4, 1, v4
-; GFX10-NEXT:    v_and_b32_e32 v5, s4, v5
-; GFX10-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX10-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX10-NEXT:    v_lshrrev_b16 v3, v3, v5
+; GFX10-NEXT:    s_movk_i32 s4, 0xff
 ; GFX10-NEXT:    v_lshlrev_b16 v4, v6, v4
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v7, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v2, v0
@@ -768,10 +762,9 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX6-NEXT:    s_lshr_b32 s9, s2, 24
 ; GFX6-NEXT:    s_and_b32 s10, s2, 7
 ; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
-; GFX6-NEXT:    s_movk_i32 s11, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX6-NEXT:    s_and_b32 s2, s1, s11
+; GFX6-NEXT:    s_and_b32 s2, s1, 0xff
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s10
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s7, 7
@@ -792,24 +785,23 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX6-NEXT:    s_and_b32 s3, s9, 7
 ; GFX6-NEXT:    s_andn2_b32 s4, 7, s9
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX6-NEXT:    s_and_b32 s2, s2, s11
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s4, s5, s4
 ; GFX6-NEXT:    s_lshr_b32 s3, s6, s3
-; GFX6-NEXT:    s_and_b32 s0, s0, s11
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX6-NEXT:    s_and_b32 s1, s1, s11
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_or_b32 s3, s4, s3
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s3, s11
+; GFX6-NEXT:    s_and_b32 s1, s3, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_fshr_v4i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_movk_i32 s13, 0xff
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s5, s0, 24
@@ -822,14 +814,14 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX8-NEXT:    s_and_b32 s12, s2, 7
 ; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX8-NEXT:    s_and_b32 s1, s1, s13
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_andn2_b32 s2, 7, s9
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s12
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, s2
-; GFX8-NEXT:    s_and_b32 s3, s6, s13
+; GFX8-NEXT:    s_and_b32 s3, s6, 0xff
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s9, 7
 ; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
@@ -837,33 +829,32 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX8-NEXT:    s_andn2_b32 s3, 7, s10
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 1
 ; GFX8-NEXT:    s_lshl_b32 s3, s4, s3
-; GFX8-NEXT:    s_and_b32 s4, s7, s13
+; GFX8-NEXT:    s_and_b32 s4, s7, 0xff
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
 ; GFX8-NEXT:    s_and_b32 s2, s10, 7
 ; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s2, s4, s2
-; GFX8-NEXT:    s_and_b32 s1, s1, s13
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_or_b32 s2, s3, s2
 ; GFX8-NEXT:    s_and_b32 s3, s11, 7
 ; GFX8-NEXT:    s_andn2_b32 s4, 7, s11
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX8-NEXT:    s_and_b32 s0, s0, s13
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_lshl_b32 s4, s5, s4
 ; GFX8-NEXT:    s_lshr_b32 s3, s8, s3
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s2, s13
+; GFX8-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX8-NEXT:    s_or_b32 s3, s4, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s3, s13
+; GFX8-NEXT:    s_and_b32 s1, s3, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_fshr_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s13, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s5, s0, 24
@@ -876,14 +867,14 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX9-NEXT:    s_and_b32 s12, s2, 7
 ; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX9-NEXT:    s_and_b32 s1, s1, s13
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX9-NEXT:    s_andn2_b32 s2, 7, s9
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s12
 ; GFX9-NEXT:    s_lshl_b32 s2, s3, s2
-; GFX9-NEXT:    s_and_b32 s3, s6, s13
+; GFX9-NEXT:    s_and_b32 s3, s6, 0xff
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s9, 7
 ; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x100000
@@ -891,26 +882,26 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX9-NEXT:    s_andn2_b32 s3, 7, s10
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 1
 ; GFX9-NEXT:    s_lshl_b32 s3, s4, s3
-; GFX9-NEXT:    s_and_b32 s4, s7, s13
+; GFX9-NEXT:    s_and_b32 s4, s7, 0xff
 ; GFX9-NEXT:    s_or_b32 s1, s2, s1
 ; GFX9-NEXT:    s_and_b32 s2, s10, 7
 ; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX9-NEXT:    s_lshr_b32 s2, s4, s2
-; GFX9-NEXT:    s_and_b32 s1, s1, s13
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_or_b32 s2, s3, s2
 ; GFX9-NEXT:    s_and_b32 s3, s11, 7
 ; GFX9-NEXT:    s_andn2_b32 s4, 7, s11
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX9-NEXT:    s_and_b32 s0, s0, s13
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX9-NEXT:    s_lshl_b32 s4, s5, s4
 ; GFX9-NEXT:    s_lshr_b32 s3, s8, s3
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s2, s13
+; GFX9-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX9-NEXT:    s_or_b32 s3, s4, s3
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s3, s13
+; GFX9-NEXT:    s_and_b32 s1, s3, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -918,7 +909,6 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX10-LABEL: s_fshr_v4i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_lshr_b32 s6, s1, 8
-; GFX10-NEXT:    s_movk_i32 s13, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
@@ -929,9 +919,9 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX10-NEXT:    s_lshr_b32 s11, s2, 24
 ; GFX10-NEXT:    s_and_b32 s12, s2, 7
 ; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
-; GFX10-NEXT:    s_and_b32 s1, s1, s13
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX10-NEXT:    s_and_b32 s6, s6, s13
+; GFX10-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT:    s_and_b32 s2, s9, 7
@@ -941,7 +931,7 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, s12
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, s9
 ; GFX10-NEXT:    s_lshr_b32 s2, s6, s2
-; GFX10-NEXT:    s_and_b32 s6, s7, s13
+; GFX10-NEXT:    s_and_b32 s6, s7, 0xff
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_or_b32 s1, s3, s2
 ; GFX10-NEXT:    s_and_b32 s2, s10, 7
@@ -956,14 +946,14 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX10-NEXT:    s_lshl_b32 s4, s5, s4
 ; GFX10-NEXT:    s_lshr_b32 s5, s8, s6
 ; GFX10-NEXT:    s_or_b32 s2, s3, s2
-; GFX10-NEXT:    s_and_b32 s1, s1, s13
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX10-NEXT:    s_or_b32 s3, s4, s5
-; GFX10-NEXT:    s_and_b32 s0, s0, s13
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX10-NEXT:    s_and_b32 s2, s2, s13
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX10-NEXT:    s_and_b32 s2, s3, s13
+; GFX10-NEXT:    s_and_b32 s2, s3, 0xff
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
@@ -1136,52 +1126,51 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v6
+; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v5
 ; GFX10-NEXT:    v_lshlrev_b16 v3, 1, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
-; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
-; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
+; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v13, 0xff
 ; GFX10-NEXT:    v_xor_b32_e32 v14, -1, v12
-; GFX10-NEXT:    v_lshlrev_b16 v3, v11, v3
-; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v10
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
+; GFX10-NEXT:    v_lshlrev_b16 v3, v10, v3
+; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v11
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
-; GFX10-NEXT:    v_and_b32_e32 v8, s4, v1
-; GFX10-NEXT:    v_and_b32_e32 v6, 7, v6
-; GFX10-NEXT:    v_and_b32_e32 v7, s4, v7
-; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xff, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 7, v5
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
+; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
 ; GFX10-NEXT:    v_lshlrev_b16 v4, 1, v4
 ; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_and_b32_e32 v13, 7, v14
-; GFX10-NEXT:    v_lshlrev_b16 v5, 1, v5
+; GFX10-NEXT:    v_lshlrev_b16 v6, 1, v6
 ; GFX10-NEXT:    v_and_b32_e32 v12, 7, v12
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX10-NEXT:    v_lshrrev_b16 v6, v6, v7
-; GFX10-NEXT:    v_lshlrev_b16 v4, v11, v4
-; GFX10-NEXT:    v_lshrrev_b16 v1, v10, v1
-; GFX10-NEXT:    v_lshlrev_b16 v5, v13, v5
+; GFX10-NEXT:    v_lshrrev_b16 v5, v5, v7
+; GFX10-NEXT:    v_lshlrev_b16 v4, v10, v4
+; GFX10-NEXT:    v_lshrrev_b16 v1, v11, v1
+; GFX10-NEXT:    v_lshlrev_b16 v6, v13, v6
 ; GFX10-NEXT:    v_lshrrev_b16 v7, v12, v9
 ; GFX10-NEXT:    v_lshrrev_b16 v2, v2, v8
-; GFX10-NEXT:    v_or_b32_e32 v3, v3, v6
-; GFX10-NEXT:    v_mov_b32_e32 v6, 8
+; GFX10-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX10-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX10-NEXT:    v_or_b32_e32 v4, v5, v7
+; GFX10-NEXT:    v_or_b32_e32 v4, v6, v7
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, s4, v4
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
@@ -1200,12 +1189,12 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xffffff
 ; GFX6-NEXT:    s_mov_b32 s3, 0xffffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s3
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX6-NEXT:    s_and_b32 s1, s1, s3
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffffff
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -1232,12 +1221,12 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xffffff
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffffff
-; GFX8-NEXT:    s_and_b32 s2, s2, s3
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX8-NEXT:    s_and_b32 s1, s1, s3
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffffff
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -1264,11 +1253,11 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffffff
 ; GFX9-NEXT:    s_mov_b32 s3, 0xffffff
-; GFX9-NEXT:    s_and_b32 s2, s2, s3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_and_b32 s1, s1, s3
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffffff
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
@@ -1293,10 +1282,9 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX10-LABEL: s_fshr_i24:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
-; GFX10-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xffffff
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xffffff
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX10-NEXT:    s_and_b32 s2, s2, s3
-; GFX10-NEXT:    s_and_b32 s1, s1, s3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -1313,8 +1301,8 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 23, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, s1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v1, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -1425,6 +1413,7 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
@@ -1432,9 +1421,7 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX10-NEXT:    v_mul_lo_u32 v4, 0xffffffe8, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v4
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffffff
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v1, v4
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
@@ -1444,8 +1431,8 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX10-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v3, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1458,38 +1445,36 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_movk_i32 s9, 0xff
-; GFX6-NEXT:    s_mov_b32 s11, 0x80008
 ; GFX6-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_lshr_b32 s7, s0, 24
 ; GFX6-NEXT:    s_lshr_b32 s8, s1, 8
-; GFX6-NEXT:    s_and_b32 s10, s0, s9
-; GFX6-NEXT:    s_bfe_u32 s0, s0, s11
-; GFX6-NEXT:    s_and_b32 s1, s1, s9
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    s_and_b32 s10, s0, 0xff
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x80008
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX6-NEXT:    s_or_b32 s0, s10, s0
 ; GFX6-NEXT:    s_or_b32 s1, s7, s1
-; GFX6-NEXT:    s_and_b32 s7, s8, s9
+; GFX6-NEXT:    s_and_b32 s7, s8, 0xff
 ; GFX6-NEXT:    s_lshr_b32 s8, s2, 16
 ; GFX6-NEXT:    s_lshr_b32 s10, s2, 24
-; GFX6-NEXT:    s_and_b32 s13, s2, s9
-; GFX6-NEXT:    s_bfe_u32 s2, s2, s11
+; GFX6-NEXT:    s_and_b32 s12, s2, 0xff
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x80008
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX6-NEXT:    s_and_b32 s8, s8, s9
-; GFX6-NEXT:    s_or_b32 s2, s13, s2
+; GFX6-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX6-NEXT:    s_or_b32 s2, s12, s2
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
-; GFX6-NEXT:    s_lshr_b32 s12, s3, 8
+; GFX6-NEXT:    s_lshr_b32 s11, s3, 8
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX6-NEXT:    s_and_b32 s3, s3, s9
+; GFX6-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX6-NEXT:    s_or_b32 s2, s2, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX6-NEXT:    s_and_b32 s8, s12, s9
+; GFX6-NEXT:    s_and_b32 s8, s11, 0xff
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    s_or_b32 s3, s10, s3
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
@@ -1498,13 +1483,13 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_or_b32 s3, s3, s8
 ; GFX6-NEXT:    s_lshr_b32 s8, s4, 16
 ; GFX6-NEXT:    s_lshr_b32 s10, s4, 24
-; GFX6-NEXT:    s_and_b32 s13, s4, s9
-; GFX6-NEXT:    s_bfe_u32 s4, s4, s11
+; GFX6-NEXT:    s_and_b32 s12, s4, 0xff
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x80008
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 8
-; GFX6-NEXT:    s_and_b32 s8, s8, s9
+; GFX6-NEXT:    s_and_b32 s8, s8, 0xff
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, 24
-; GFX6-NEXT:    s_or_b32 s4, s13, s4
+; GFX6-NEXT:    s_or_b32 s4, s12, s4
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
@@ -1513,16 +1498,16 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT:    s_lshr_b32 s12, s5, 8
+; GFX6-NEXT:    s_lshr_b32 s11, s5, 8
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX6-NEXT:    s_and_b32 s5, s5, s9
+; GFX6-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX6-NEXT:    s_and_b32 s8, s12, s9
+; GFX6-NEXT:    s_and_b32 s8, s11, 0xff
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    s_or_b32 s5, s10, s5
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
@@ -1534,7 +1519,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    s_and_b32 s6, s6, s9
+; GFX6-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, 24
@@ -1564,6 +1549,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX6-NEXT:    s_movk_i32 s9, 0xff
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s3, v1
 ; GFX6-NEXT:    v_bfe_u32 v3, v0, 8, 8
@@ -1589,55 +1575,54 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_movk_i32 s10, 0xff
 ; GFX8-NEXT:    s_lshr_b32 s9, s1, 8
-; GFX8-NEXT:    s_bfe_u32 s11, 8, 0x100000
-; GFX8-NEXT:    s_and_b32 s1, s1, s10
+; GFX8-NEXT:    s_bfe_u32 s10, 8, 0x100000
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_lshr_b32 s6, s0, 8
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 24
-; GFX8-NEXT:    s_lshl_b32 s1, s1, s11
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s10
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX8-NEXT:    s_and_b32 s6, s6, s10
+; GFX8-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX8-NEXT:    s_or_b32 s1, s8, s1
 ; GFX8-NEXT:    s_lshr_b32 s8, s2, 8
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s10
-; GFX8-NEXT:    s_lshl_b32 s6, s6, s11
-; GFX8-NEXT:    s_and_b32 s8, s8, s10
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX8-NEXT:    s_lshl_b32 s6, s6, s10
+; GFX8-NEXT:    s_and_b32 s8, s8, 0xff
 ; GFX8-NEXT:    s_or_b32 s0, s0, s6
-; GFX8-NEXT:    s_and_b32 s6, s7, s10
-; GFX8-NEXT:    s_and_b32 s7, s9, s10
+; GFX8-NEXT:    s_and_b32 s6, s7, 0xff
+; GFX8-NEXT:    s_and_b32 s7, s9, 0xff
 ; GFX8-NEXT:    s_lshr_b32 s9, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s12, s2, 24
-; GFX8-NEXT:    s_and_b32 s2, s2, s10
-; GFX8-NEXT:    s_lshl_b32 s8, s8, s11
+; GFX8-NEXT:    s_lshr_b32 s11, s2, 24
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX8-NEXT:    s_lshl_b32 s8, s8, s10
 ; GFX8-NEXT:    s_or_b32 s2, s2, s8
-; GFX8-NEXT:    s_and_b32 s8, s9, s10
+; GFX8-NEXT:    s_and_b32 s8, s9, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX8-NEXT:    s_bfe_u32 s8, s8, 0x100000
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v1, v0
-; GFX8-NEXT:    s_lshr_b32 s13, s3, 8
+; GFX8-NEXT:    s_lshr_b32 s12, s3, 8
 ; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, s10
+; GFX8-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX8-NEXT:    s_or_b32 s2, s2, s8
-; GFX8-NEXT:    s_lshl_b32 s3, s3, s11
-; GFX8-NEXT:    s_and_b32 s8, s13, s10
-; GFX8-NEXT:    s_or_b32 s3, s12, s3
+; GFX8-NEXT:    s_lshl_b32 s3, s3, s10
+; GFX8-NEXT:    s_and_b32 s8, s12, 0xff
+; GFX8-NEXT:    s_or_b32 s3, s11, s3
 ; GFX8-NEXT:    s_bfe_u32 s8, s8, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    s_or_b32 s3, s3, s8
 ; GFX8-NEXT:    s_lshr_b32 s8, s4, 8
-; GFX8-NEXT:    s_and_b32 s8, s8, s10
+; GFX8-NEXT:    s_and_b32 s8, s8, 0xff
 ; GFX8-NEXT:    s_lshr_b32 s9, s4, 16
-; GFX8-NEXT:    s_lshr_b32 s12, s4, 24
-; GFX8-NEXT:    s_and_b32 s4, s4, s10
-; GFX8-NEXT:    s_lshl_b32 s8, s8, s11
+; GFX8-NEXT:    s_lshr_b32 s11, s4, 24
+; GFX8-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX8-NEXT:    s_lshl_b32 s8, s8, s10
 ; GFX8-NEXT:    s_or_b32 s4, s4, s8
-; GFX8-NEXT:    s_and_b32 s8, s9, s10
+; GFX8-NEXT:    s_and_b32 s8, s9, 0xff
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v2, 24
 ; GFX8-NEXT:    s_bfe_u32 s8, s8, 0x100000
@@ -1648,18 +1633,18 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX8-NEXT:    s_lshr_b32 s13, s5, 8
+; GFX8-NEXT:    s_lshr_b32 s12, s5, 8
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX8-NEXT:    s_and_b32 s5, s5, s10
+; GFX8-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v2
-; GFX8-NEXT:    s_lshl_b32 s5, s5, s11
+; GFX8-NEXT:    s_lshl_b32 s5, s5, s10
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v0
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX8-NEXT:    s_and_b32 s8, s13, s10
+; GFX8-NEXT:    s_and_b32 s8, s12, 0xff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    s_or_b32 s5, s12, s5
+; GFX8-NEXT:    s_or_b32 s5, s11, s5
 ; GFX8-NEXT:    s_bfe_u32 s8, s8, 0x100000
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v0
 ; GFX8-NEXT:    s_bfe_u32 s5, s5, 0x100000
@@ -1707,7 +1692,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX8-NEXT:    v_and_b32_e32 v3, s10, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
@@ -1721,45 +1706,44 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
-; GFX9-NEXT:    s_movk_i32 s12, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s11, s1, 8
+; GFX9-NEXT:    s_bfe_u32 s12, 8, 0x100000
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_bfe_u32 s13, 8, 0x100000
-; GFX9-NEXT:    s_and_b32 s1, s1, s12
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s7, s0, 8
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX9-NEXT:    s_lshr_b32 s10, s0, 24
-; GFX9-NEXT:    s_lshl_b32 s1, s1, s13
-; GFX9-NEXT:    s_and_b32 s7, s7, s12
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v0
+; GFX9-NEXT:    s_lshl_b32 s1, s1, s12
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX9-NEXT:    s_or_b32 s1, s10, s1
+; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    s_lshr_b32 s10, s2, 8
 ; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
-; GFX9-NEXT:    s_and_b32 s0, s0, s12
-; GFX9-NEXT:    s_lshl_b32 s7, s7, s13
-; GFX9-NEXT:    s_and_b32 s10, s10, s12
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX9-NEXT:    s_lshl_b32 s7, s7, s12
+; GFX9-NEXT:    s_and_b32 s10, s10, 0xff
 ; GFX9-NEXT:    s_or_b32 s0, s0, s7
-; GFX9-NEXT:    s_and_b32 s7, s9, s12
-; GFX9-NEXT:    s_and_b32 s9, s11, s12
+; GFX9-NEXT:    s_and_b32 s7, s9, 0xff
+; GFX9-NEXT:    s_and_b32 s9, s11, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s11, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s14, s2, 24
-; GFX9-NEXT:    s_and_b32 s2, s2, s12
-; GFX9-NEXT:    s_lshl_b32 s10, s10, s13
+; GFX9-NEXT:    s_lshr_b32 s13, s2, 24
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX9-NEXT:    s_lshl_b32 s10, s10, s12
 ; GFX9-NEXT:    s_or_b32 s2, s2, s10
-; GFX9-NEXT:    s_and_b32 s10, s11, s12
+; GFX9-NEXT:    s_and_b32 s10, s11, 0xff
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, 24
 ; GFX9-NEXT:    s_bfe_u32 s10, s10, 0x100000
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT:    s_lshr_b32 s15, s3, 8
+; GFX9-NEXT:    s_lshr_b32 s14, s3, 8
 ; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s10, s10, 16
-; GFX9-NEXT:    s_and_b32 s3, s3, s12
+; GFX9-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX9-NEXT:    s_or_b32 s2, s2, s10
-; GFX9-NEXT:    s_lshl_b32 s3, s3, s13
-; GFX9-NEXT:    s_and_b32 s10, s15, s12
-; GFX9-NEXT:    s_or_b32 s3, s14, s3
+; GFX9-NEXT:    s_lshl_b32 s3, s3, s12
+; GFX9-NEXT:    s_and_b32 s10, s14, 0xff
+; GFX9-NEXT:    s_or_b32 s3, s13, s3
 ; GFX9-NEXT:    s_bfe_u32 s10, s10, 0x100000
 ; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s10, s10, 16
@@ -1767,25 +1751,25 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_or_b32 s3, s3, s10
 ; GFX9-NEXT:    s_lshr_b32 s10, s4, 8
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    s_and_b32 s10, s10, s12
+; GFX9-NEXT:    s_and_b32 s10, s10, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s11, s4, 16
-; GFX9-NEXT:    s_lshr_b32 s14, s4, 24
-; GFX9-NEXT:    s_and_b32 s4, s4, s12
-; GFX9-NEXT:    s_lshl_b32 s10, s10, s13
+; GFX9-NEXT:    s_lshr_b32 s13, s4, 24
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX9-NEXT:    s_lshl_b32 s10, s10, s12
 ; GFX9-NEXT:    s_or_b32 s4, s4, s10
-; GFX9-NEXT:    s_and_b32 s10, s11, s12
+; GFX9-NEXT:    s_and_b32 s10, s11, 0xff
 ; GFX9-NEXT:    s_bfe_u32 s10, s10, 0x100000
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s10, s10, 16
 ; GFX9-NEXT:    s_or_b32 s4, s4, s10
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX9-NEXT:    s_lshr_b32 s15, s5, 8
-; GFX9-NEXT:    s_and_b32 s5, s5, s12
+; GFX9-NEXT:    s_lshr_b32 s14, s5, 8
+; GFX9-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX9-NEXT:    s_lshl_b32 s5, s5, s13
-; GFX9-NEXT:    s_and_b32 s10, s15, s12
-; GFX9-NEXT:    s_or_b32 s5, s14, s5
+; GFX9-NEXT:    s_lshl_b32 s5, s5, s12
+; GFX9-NEXT:    s_and_b32 s10, s14, 0xff
+; GFX9-NEXT:    s_or_b32 s5, s13, s5
 ; GFX9-NEXT:    s_bfe_u32 s10, s10, 0x100000
 ; GFX9-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s10, s10, 16
@@ -1831,10 +1815,11 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v1, v1, s3
 ; GFX9-NEXT:    s_mov_b32 s6, 8
 ; GFX9-NEXT:    v_lshl_or_b32 v1, s0, v3, v1
+; GFX9-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NEXT:    s_mov_b32 s8, 16
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_b32_e32 v3, s12, v1
-; GFX9-NEXT:    v_and_or_b32 v2, v0, s12, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, s0, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v0, s0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX9-NEXT:    v_or3_b32 v0, v2, v0, v3
@@ -1849,122 +1834,119 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, 24
-; GFX10-NEXT:    s_movk_i32 s9, 0xff
-; GFX10-NEXT:    s_lshr_b32 s12, s4, 8
+; GFX10-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX10-NEXT:    s_bfe_u32 s10, 8, 0x100000
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX10-NEXT:    s_lshr_b32 s13, s4, 16
-; GFX10-NEXT:    s_and_b32 s12, s12, s9
-; GFX10-NEXT:    s_lshr_b32 s14, s4, 24
-; GFX10-NEXT:    s_and_b32 s4, s4, s9
-; GFX10-NEXT:    s_lshl_b32 s12, s12, s10
-; GFX10-NEXT:    s_and_b32 s13, s13, s9
-; GFX10-NEXT:    s_or_b32 s4, s4, s12
-; GFX10-NEXT:    s_bfe_u32 s12, s13, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX10-NEXT:    s_lshl_b32 s1, s1, s10
+; GFX10-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX10-NEXT:    s_or_b32 s1, s8, s1
+; GFX10-NEXT:    s_lshr_b32 s8, s4, 8
+; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX10-NEXT:    s_lshr_b32 s15, s5, 8
-; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
-; GFX10-NEXT:    s_lshl_b32 s12, s12, 16
+; GFX10-NEXT:    s_lshl_b32 s6, s6, s10
+; GFX10-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX10-NEXT:    s_or_b32 s0, s0, s6
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    s_and_b32 s5, s5, s9
-; GFX10-NEXT:    s_or_b32 s4, s4, s12
-; GFX10-NEXT:    s_lshl_b32 s5, s5, s10
+; GFX10-NEXT:    s_and_b32 s6, s7, 0xff
+; GFX10-NEXT:    s_and_b32 s7, s9, 0xff
+; GFX10-NEXT:    s_lshr_b32 s9, s4, 16
 ; GFX10-NEXT:    v_mul_lo_u32 v2, 0xffffffe8, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, 0xffffffe8, v1
-; GFX10-NEXT:    s_and_b32 s12, s15, s9
-; GFX10-NEXT:    s_or_b32 s5, s14, s5
-; GFX10-NEXT:    s_bfe_u32 s12, s12, 0x100000
-; GFX10-NEXT:    s_bfe_u32 s5, s5, 0x100000
-; GFX10-NEXT:    s_lshl_b32 s12, s12, 16
-; GFX10-NEXT:    s_lshr_b32 s11, s1, 8
+; GFX10-NEXT:    s_lshr_b32 s11, s4, 24
+; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX10-NEXT:    s_lshl_b32 s8, s8, s10
+; GFX10-NEXT:    s_lshr_b32 s12, s5, 8
+; GFX10-NEXT:    s_or_b32 s4, s4, s8
+; GFX10-NEXT:    s_and_b32 s8, s9, 0xff
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX10-NEXT:    s_or_b32 s5, s5, s12
-; GFX10-NEXT:    s_and_b32 s1, s1, s9
-; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
-; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
-; GFX10-NEXT:    s_lshl_b32 s1, s1, s10
-; GFX10-NEXT:    s_and_b32 s6, s6, s9
-; GFX10-NEXT:    s_or_b32 s1, s8, s1
+; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX10-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX10-NEXT:    s_and_b32 s5, s5, 0xff
+; GFX10-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX10-NEXT:    s_lshl_b32 s5, s5, s10
+; GFX10-NEXT:    s_or_b32 s4, s4, s8
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
-; GFX10-NEXT:    v_mul_hi_u32 v2, v1, v3
-; GFX10-NEXT:    s_lshr_b32 s8, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX10-NEXT:    s_and_b32 s0, s0, s9
+; GFX10-NEXT:    s_and_b32 s8, s12, 0xff
+; GFX10-NEXT:    s_or_b32 s5, s11, s5
+; GFX10-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX10-NEXT:    s_lshl_b32 s6, s6, s10
-; GFX10-NEXT:    s_and_b32 s8, s8, s9
-; GFX10-NEXT:    s_or_b32 s0, s0, s6
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX10-NEXT:    s_and_b32 s6, s7, s9
-; GFX10-NEXT:    s_and_b32 s7, s11, s9
-; GFX10-NEXT:    s_lshr_b32 s11, s2, 16
-; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX10-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX10-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX10-NEXT:    s_or_b32 s5, s5, s8
+; GFX10-NEXT:    s_lshr_b32 s8, s2, 16
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX10-NEXT:    s_lshr_b32 s13, s2, 24
-; GFX10-NEXT:    s_and_b32 s2, s2, s9
-; GFX10-NEXT:    s_lshl_b32 s8, s8, s10
+; GFX10-NEXT:    s_and_b32 s9, s9, 0xff
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX10-NEXT:    s_lshr_b32 s11, s2, 24
 ; GFX10-NEXT:    s_lshr_b32 s12, s3, 8
-; GFX10-NEXT:    s_or_b32 s2, s2, s8
-; GFX10-NEXT:    s_and_b32 s8, s11, s9
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX10-NEXT:    s_lshl_b32 s9, s9, s10
+; GFX10-NEXT:    s_and_b32 s8, s8, 0xff
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX10-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT:    s_or_b32 s2, s2, s9
 ; GFX10-NEXT:    s_bfe_u32 s4, s8, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s10
 ; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
-; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT:    s_and_b32 s3, s3, s9
-; GFX10-NEXT:    s_or_b32 s2, s2, s4
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s5, v1
-; GFX10-NEXT:    s_mov_b32 s4, 0xffffff
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT:    s_and_b32 s5, s12, 0xff
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-NEXT:    s_or_b32 s3, s11, s3
+; GFX10-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    s_lshl_b32 s3, s3, s10
-; GFX10-NEXT:    s_and_b32 s5, s12, s9
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
-; GFX10-NEXT:    s_or_b32 s3, s13, s3
-; GFX10-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX10-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
+; GFX10-NEXT:    s_or_b32 s2, s2, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX10-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX10-NEXT:    s_or_b32 s3, s3, s5
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX10-NEXT:    s_or_b32 s3, s3, s5
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX10-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s4, s6, 17
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffffff
-; GFX10-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX10-NEXT:    s_lshl_b32 s5, s6, 17
-; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 23, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, v1, v2
-; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, s2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 23, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, s2
+; GFX10-NEXT:    s_or_b32 s0, s4, s0
 ; GFX10-NEXT:    s_lshl_b32 s2, s7, 17
-; GFX10-NEXT:    v_and_b32_e32 v2, v4, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v1, v1, s3
-; GFX10-NEXT:    s_or_b32 s0, s5, s0
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v3, v0
 ; GFX10-NEXT:    s_or_b32 s0, s2, s1
 ; GFX10-NEXT:    v_lshl_or_b32 v1, s0, v2, v1
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    s_mov_b32 s0, 16
-; GFX10-NEXT:    v_and_b32_e32 v3, s9, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX10-NEXT:    v_bfe_u32 v4, v1, 8, 8
 ; GFX10-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s9, v2
+; GFX10-NEXT:    v_and_or_b32 v2, v0, 0xff, v2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 8, v4
@@ -2156,14 +2138,13 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v7, 24
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0xffffff
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; GFX10-NEXT:    v_and_b32_e32 v5, v5, v10
-; GFX10-NEXT:    v_and_b32_e32 v2, v2, v10
-; GFX10-NEXT:    v_and_b32_e32 v3, v3, v10
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
@@ -2194,12 +2175,12 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v4, v4, v10
-; GFX10-NEXT:    v_and_b32_e32 v6, v6, v10
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
-; GFX10-NEXT:    v_and_b32_e32 v5, v5, v10
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
-; GFX10-NEXT:    v_and_b32_e32 v4, v7, v10
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v7
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v6, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, v4, v3
@@ -3032,25 +3013,24 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX6-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX6-NEXT:    s_or_b32 s4, s5, s4
 ; GFX6-NEXT:    s_bfe_u32 s5, 1, 0x100000
-; GFX6-NEXT:    s_mov_b32 s6, 0xf0001
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s5
-; GFX6-NEXT:    s_bfe_u32 s7, s2, s6
-; GFX6-NEXT:    s_bfe_u32 s8, 14, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s6, s2, 0xf0001
+; GFX6-NEXT:    s_bfe_u32 s7, 14, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s5
-; GFX6-NEXT:    s_bfe_u32 s5, s3, s6
-; GFX6-NEXT:    s_lshr_b32 s7, s7, s8
-; GFX6-NEXT:    s_lshr_b32 s5, s5, s8
+; GFX6-NEXT:    s_bfe_u32 s5, s3, 0xf0001
+; GFX6-NEXT:    s_lshr_b32 s6, s6, s7
+; GFX6-NEXT:    s_lshr_b32 s5, s5, s7
 ; GFX6-NEXT:    s_xor_b32 s4, s4, -1
-; GFX6-NEXT:    s_or_b32 s0, s0, s7
+; GFX6-NEXT:    s_or_b32 s0, s0, s6
 ; GFX6-NEXT:    s_or_b32 s1, s1, s5
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX6-NEXT:    s_lshr_b32 s5, s4, 16
-; GFX6-NEXT:    s_and_b32 s7, s4, 15
+; GFX6-NEXT:    s_and_b32 s6, s4, 15
 ; GFX6-NEXT:    s_andn2_b32 s4, 15, s4
-; GFX6-NEXT:    s_bfe_u32 s7, s7, 0x100000
-; GFX6-NEXT:    s_bfe_u32 s2, s2, s6
+; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
-; GFX6-NEXT:    s_lshl_b32 s0, s0, s7
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s6
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s5, 15
@@ -3058,7 +3038,7 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX6-NEXT:    s_andn2_b32 s4, 15, s5
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s2
-; GFX6-NEXT:    s_bfe_u32 s2, s3, s6
+; GFX6-NEXT:    s_bfe_u32 s2, s3, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s3, s4, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s3
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
@@ -3111,45 +3091,43 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ;
 ; GFX9-LABEL: s_fshr_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000f
-; GFX9-NEXT:    s_and_b32 s4, s2, s3
-; GFX9-NEXT:    s_andn2_b32 s2, s3, s2
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x10001
-; GFX9-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX9-NEXT:    s_and_b32 s3, s2, 0xf000f
+; GFX9-NEXT:    s_andn2_b32 s2, 0xf000f, s2
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX9-NEXT:    s_lshl_b32 s2, s3, s5
+; GFX9-NEXT:    s_lshl_b32 s2, s4, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s3, s4, 16
-; GFX9-NEXT:    s_lshr_b32 s1, s1, s4
-; GFX9-NEXT:    s_lshr_b32 s2, s2, s3
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX9-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_fshr_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX10-NEXT:    s_mov_b32 s3, 0xf000f
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x10001
-; GFX10-NEXT:    s_lshl_b32 s4, s4, 1
-; GFX10-NEXT:    s_and_b32 s5, s2, s3
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
-; GFX10-NEXT:    s_andn2_b32 s2, s3, s2
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX10-NEXT:    s_and_b32 s4, s2, 0xf000f
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
+; GFX10-NEXT:    s_andn2_b32 s2, 0xf000f, s2
 ; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX10-NEXT:    s_lshl_b32 s2, s3, s4
+; GFX10-NEXT:    s_lshl_b32 s2, s3, s5
 ; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX10-NEXT:    s_lshr_b32 s4, s5, 16
-; GFX10-NEXT:    s_lshr_b32 s1, s1, s5
-; GFX10-NEXT:    s_lshr_b32 s3, s3, s4
+; GFX10-NEXT:    s_lshr_b32 s5, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s4
+; GFX10-NEXT:    s_lshr_b32 s3, s3, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
@@ -3252,10 +3230,9 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0xf000f
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xf000f, v3
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3326,35 +3303,34 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT:    s_mov_b32 s5, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s4, 1, 0x100000
-; GFX6-NEXT:    s_bfe_u32 s6, s2, s5
-; GFX6-NEXT:    s_bfe_u32 s7, 14, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s5, s2, 0xf0001
+; GFX6-NEXT:    s_bfe_u32 s6, 14, 0x100000
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX6-NEXT:    s_lshr_b32 s6, s6, s7
+; GFX6-NEXT:    s_lshr_b32 s5, s5, s6
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT:    s_or_b32 s0, s0, s6
+; GFX6-NEXT:    s_or_b32 s0, s0, s5
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
-; GFX6-NEXT:    s_bfe_u32 s0, s2, s5
+; GFX6-NEXT:    s_bfe_u32 s0, s2, 0xf0001
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX6-NEXT:    s_bfe_u32 s4, s3, s5
+; GFX6-NEXT:    s_bfe_u32 s4, s3, 0xf0001
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
-; GFX6-NEXT:    s_lshr_b32 s4, s4, s7
+; GFX6-NEXT:    s_lshr_b32 s4, s4, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX6-NEXT:    s_or_b32 s1, s1, s4
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
-; GFX6-NEXT:    s_bfe_u32 s0, s3, s5
+; GFX6-NEXT:    s_bfe_u32 s0, s3, 0xf0001
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s1, v2
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
@@ -3421,12 +3397,11 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX10-LABEL: v_fshr_v2i16_ssv:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v0
-; GFX10-NEXT:    s_mov_b32 s2, 0xf000f
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX10-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xf000f, v0
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x10001
-; GFX10-NEXT:    v_and_b32_e32 v1, s2, v1
-; GFX10-NEXT:    s_lshl_b32 s2, s3, 1
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xf000f, v1
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, v0, s1
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, s0
@@ -3514,36 +3489,34 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 ;
 ; GFX9-LABEL: v_fshr_v2i16_svs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s2, 0xf000f
-; GFX9-NEXT:    s_and_b32 s3, s1, s2
-; GFX9-NEXT:    s_andn2_b32 s1, s2, s1
-; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x10001
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 1
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
-; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX9-NEXT:    s_and_b32 s2, s1, 0xf000f
+; GFX9-NEXT:    s_andn2_b32 s1, 0xf000f, s1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
-; GFX9-NEXT:    s_lshl_b32 s1, s2, s4
+; GFX9-NEXT:    s_lshl_b32 s1, s3, s4
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX9-NEXT:    v_pk_lshrrev_b16 v0, s3, v0
+; GFX9-NEXT:    v_pk_lshrrev_b16 v0, s2, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_fshr_v2i16_svs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX10-NEXT:    s_mov_b32 s2, 0xf000f
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x10001
-; GFX10-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX10-NEXT:    s_and_b32 s4, s1, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX10-NEXT:    s_andn2_b32 s1, s2, s1
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX10-NEXT:    s_and_b32 s3, s1, 0xf000f
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX10-NEXT:    s_andn2_b32 s1, 0xf000f, s1
 ; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX10-NEXT:    v_pk_lshrrev_b16 v0, s4, v0
+; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX10-NEXT:    v_pk_lshrrev_b16 v0, s3, v0
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s1
-; GFX10-NEXT:    s_lshl_b32 s1, s2, s3
+; GFX10-NEXT:    s_lshl_b32 s1, s2, s4
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -3559,25 +3532,24 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX6-NEXT:    s_or_b32 s2, s3, s2
 ; GFX6-NEXT:    s_bfe_u32 s3, 1, 0x100000
-; GFX6-NEXT:    s_mov_b32 s4, 0xf0001
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s3, v0
-; GFX6-NEXT:    s_bfe_u32 s5, s0, s4
-; GFX6-NEXT:    s_bfe_u32 s6, 14, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s4, s0, 0xf0001
+; GFX6-NEXT:    s_bfe_u32 s5, 14, 0x100000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s3, v1
-; GFX6-NEXT:    s_bfe_u32 s3, s1, s4
-; GFX6-NEXT:    s_lshr_b32 s5, s5, s6
-; GFX6-NEXT:    s_lshr_b32 s3, s3, s6
+; GFX6-NEXT:    s_bfe_u32 s3, s1, 0xf0001
+; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
+; GFX6-NEXT:    s_lshr_b32 s3, s3, s5
 ; GFX6-NEXT:    s_xor_b32 s2, s2, -1
-; GFX6-NEXT:    v_or_b32_e32 v0, s5, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_or_b32_e32 v1, s3, v1
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX6-NEXT:    s_and_b32 s5, s2, 15
+; GFX6-NEXT:    s_and_b32 s4, s2, 15
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
-; GFX6-NEXT:    s_bfe_u32 s0, s0, s4
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s5, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, s2
 ; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s3, 15
@@ -3585,7 +3557,7 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
-; GFX6-NEXT:    s_bfe_u32 s0, s1, s4
+; GFX6-NEXT:    s_bfe_u32 s0, s1, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s1, s2, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX6-NEXT:    v_or_b32_e32 v1, s0, v1
@@ -3635,32 +3607,30 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ;
 ; GFX9-LABEL: v_fshr_v2i16_vss:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s2, 0xf000f
-; GFX9-NEXT:    s_and_b32 s3, s1, s2
-; GFX9-NEXT:    s_andn2_b32 s1, s2, s1
+; GFX9-NEXT:    s_and_b32 s2, s1, 0xf000f
+; GFX9-NEXT:    s_andn2_b32 s1, 0xf000f, s1
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s1, v0
 ; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s2, s3, 16
-; GFX9-NEXT:    s_lshr_b32 s0, s0, s3
-; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_fshr_v2i16_vss:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s2, 0xf000f
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    s_and_b32 s3, s1, s2
-; GFX10-NEXT:    s_andn2_b32 s1, s2, s1
-; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    s_and_b32 s2, s1, 0xf000f
+; GFX10-NEXT:    s_andn2_b32 s1, 0xf000f, s1
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX10-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s1, v0
-; GFX10-NEXT:    s_lshr_b32 s0, s0, s3
-; GFX10-NEXT:    s_lshr_b32 s1, s2, s4
+; GFX10-NEXT:    s_lshr_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshr_b32 s1, s3, s4
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -3686,41 +3656,39 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) {
 ; GFX6-LABEL: s_fshr_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s12, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s9, s9, 16
-; GFX6-NEXT:    s_and_b32 s8, s8, s12
+; GFX6-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GFX6-NEXT:    s_or_b32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s9, s11, 16
-; GFX6-NEXT:    s_and_b32 s10, s10, s12
-; GFX6-NEXT:    s_mov_b32 s11, 0xf0001
+; GFX6-NEXT:    s_and_b32 s10, s10, 0xffff
 ; GFX6-NEXT:    s_or_b32 s9, s9, s10
 ; GFX6-NEXT:    s_bfe_u32 s10, 1, 0x100000
-; GFX6-NEXT:    s_bfe_u32 s12, s4, s11
-; GFX6-NEXT:    s_bfe_u32 s13, 14, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s11, s4, 0xf0001
+; GFX6-NEXT:    s_bfe_u32 s12, 14, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s10
-; GFX6-NEXT:    s_lshr_b32 s12, s12, s13
-; GFX6-NEXT:    s_or_b32 s0, s0, s12
-; GFX6-NEXT:    s_bfe_u32 s12, s5, s11
+; GFX6-NEXT:    s_lshr_b32 s11, s11, s12
+; GFX6-NEXT:    s_or_b32 s0, s0, s11
+; GFX6-NEXT:    s_bfe_u32 s11, s5, 0xf0001
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s10
-; GFX6-NEXT:    s_lshr_b32 s12, s12, s13
+; GFX6-NEXT:    s_lshr_b32 s11, s11, s12
 ; GFX6-NEXT:    s_xor_b32 s8, s8, -1
-; GFX6-NEXT:    s_or_b32 s1, s1, s12
+; GFX6-NEXT:    s_or_b32 s1, s1, s11
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 1
-; GFX6-NEXT:    s_lshr_b32 s12, s8, 16
-; GFX6-NEXT:    s_and_b32 s14, s8, 15
+; GFX6-NEXT:    s_lshr_b32 s11, s8, 16
+; GFX6-NEXT:    s_and_b32 s13, s8, 15
 ; GFX6-NEXT:    s_andn2_b32 s8, 15, s8
-; GFX6-NEXT:    s_bfe_u32 s14, s14, 0x100000
-; GFX6-NEXT:    s_bfe_u32 s4, s4, s11
+; GFX6-NEXT:    s_bfe_u32 s13, s13, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
-; GFX6-NEXT:    s_lshl_b32 s0, s0, s14
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s13
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s4
-; GFX6-NEXT:    s_and_b32 s4, s12, 15
+; GFX6-NEXT:    s_and_b32 s4, s11, 15
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX6-NEXT:    s_andn2_b32 s8, 15, s12
+; GFX6-NEXT:    s_andn2_b32 s8, 15, s11
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX6-NEXT:    s_bfe_u32 s4, s5, s11
+; GFX6-NEXT:    s_bfe_u32 s4, s5, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s5, s8, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX6-NEXT:    s_or_b32 s1, s1, s4
@@ -3729,12 +3697,12 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, s10
-; GFX6-NEXT:    s_bfe_u32 s2, s6, s11
-; GFX6-NEXT:    s_lshr_b32 s2, s2, s13
+; GFX6-NEXT:    s_bfe_u32 s2, s6, 0xf0001
+; GFX6-NEXT:    s_lshr_b32 s2, s2, s12
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, s10
-; GFX6-NEXT:    s_bfe_u32 s3, s7, s11
-; GFX6-NEXT:    s_lshr_b32 s3, s3, s13
+; GFX6-NEXT:    s_bfe_u32 s3, s7, 0xf0001
+; GFX6-NEXT:    s_lshr_b32 s3, s3, s12
 ; GFX6-NEXT:    s_xor_b32 s5, s9, -1
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s6, 1
@@ -3743,7 +3711,7 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT:    s_and_b32 s7, s5, 15
 ; GFX6-NEXT:    s_andn2_b32 s5, 15, s5
 ; GFX6-NEXT:    s_bfe_u32 s7, s7, 0x100000
-; GFX6-NEXT:    s_bfe_u32 s3, s3, s11
+; GFX6-NEXT:    s_bfe_u32 s3, s3, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s7
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, s5
@@ -3752,7 +3720,7 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT:    s_andn2_b32 s5, 15, s6
 ; GFX6-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX6-NEXT:    s_bfe_u32 s3, s4, s11
+; GFX6-NEXT:    s_bfe_u32 s3, s4, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s4, s5, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, s4
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
@@ -3840,31 +3808,28 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ;
 ; GFX9-LABEL: s_fshr_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s8, 0x10001
-; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
-; GFX9-NEXT:    s_mov_b32 s6, 0xf000f
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s8
-; GFX9-NEXT:    s_lshl_b32 s9, s9, 1
-; GFX9-NEXT:    s_and_b32 s7, s4, s6
-; GFX9-NEXT:    s_andn2_b32 s4, s6, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s9
-; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX9-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x10001
+; GFX9-NEXT:    s_lshl_b32 s7, s7, 1
+; GFX9-NEXT:    s_and_b32 s6, s4, 0xf000f
+; GFX9-NEXT:    s_andn2_b32 s4, 0xf000f, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s7
+; GFX9-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s4, 16
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX9-NEXT:    s_lshl_b32 s4, s9, s10
-; GFX9-NEXT:    s_mov_b32 s9, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s4, s7, s8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX9-NEXT:    s_and_b32 s2, s2, s9
-; GFX9-NEXT:    s_lshr_b32 s10, s7, 16
-; GFX9-NEXT:    s_lshr_b32 s2, s2, s7
-; GFX9-NEXT:    s_lshr_b32 s4, s4, s10
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
+; GFX9-NEXT:    s_lshr_b32 s2, s2, s6
+; GFX9-NEXT:    s_lshr_b32 s4, s4, s7
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
 ; GFX9-NEXT:    s_or_b32 s0, s0, s2
-; GFX9-NEXT:    s_and_b32 s2, s5, s6
-; GFX9-NEXT:    s_andn2_b32 s4, s6, s5
+; GFX9-NEXT:    s_and_b32 s2, s5, 0xf000f
+; GFX9-NEXT:    s_andn2_b32 s4, 0xf000f, s5
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX9-NEXT:    s_lshl_b32 s1, s1, s8
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x10001
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 16
@@ -3873,7 +3838,7 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX9-NEXT:    s_lshl_b32 s4, s5, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
-; GFX9-NEXT:    s_and_b32 s3, s3, s9
+; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s2, s3, s2
 ; GFX9-NEXT:    s_lshr_b32 s3, s4, s5
@@ -3883,41 +3848,38 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ;
 ; GFX10-LABEL: s_fshr_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s7, 0x10001
-; GFX10-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX10-NEXT:    s_mov_b32 s6, 0xf000f
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s7
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 1
-; GFX10-NEXT:    s_and_b32 s9, s4, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s8
-; GFX10-NEXT:    s_andn2_b32 s4, s6, s4
-; GFX10-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x10001
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 1
+; GFX10-NEXT:    s_and_b32 s7, s4, 0xf000f
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
+; GFX10-NEXT:    s_andn2_b32 s4, 0xf000f, s4
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s8, s4, 16
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX10-NEXT:    s_lshl_b32 s4, s8, s10
-; GFX10-NEXT:    s_mov_b32 s8, 0xffff
+; GFX10-NEXT:    s_lshl_b32 s4, s6, s8
+; GFX10-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
 ; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX10-NEXT:    s_lshl_b32 s1, s1, s7
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX10-NEXT:    s_lshr_b32 s8, s7, 16
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x10001
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 1
-; GFX10-NEXT:    s_and_b32 s7, s5, s6
+; GFX10-NEXT:    s_lshr_b32 s2, s2, s7
+; GFX10-NEXT:    s_lshr_b32 s6, s6, s8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
-; GFX10-NEXT:    s_andn2_b32 s4, s6, s5
+; GFX10-NEXT:    s_andn2_b32 s4, 0xf000f, s5
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s6
+; GFX10-NEXT:    s_and_b32 s6, s5, 0xf000f
 ; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX10-NEXT:    s_and_b32 s2, s2, s8
-; GFX10-NEXT:    s_lshr_b32 s11, s9, 16
+; GFX10-NEXT:    s_lshr_b32 s7, s4, 16
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX10-NEXT:    s_lshl_b32 s4, s5, s6
+; GFX10-NEXT:    s_lshl_b32 s4, s5, s7
 ; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX10-NEXT:    s_and_b32 s3, s3, s8
-; GFX10-NEXT:    s_lshr_b32 s6, s7, 16
-; GFX10-NEXT:    s_lshr_b32 s2, s2, s9
-; GFX10-NEXT:    s_lshr_b32 s9, s10, s11
-; GFX10-NEXT:    s_lshr_b32 s3, s3, s7
-; GFX10-NEXT:    s_lshr_b32 s5, s5, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s9
+; GFX10-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX10-NEXT:    s_lshr_b32 s7, s6, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s3, s6
+; GFX10-NEXT:    s_lshr_b32 s5, s5, s7
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
 ; GFX10-NEXT:    s_or_b32 s0, s0, s2
@@ -4090,16 +4052,15 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v4
 ; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v5
-; GFX10-NEXT:    s_mov_b32 s4, 0xf000f
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX10-NEXT:    v_and_b32_e32 v6, s4, v6
-; GFX10-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xf000f, v5
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xf000f, v6
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xf000f, v7
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v2, v4, v2
-; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v6, v0
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v3, v5, v3
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v6, v0
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v7, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -5008,35 +4969,34 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v8
-; GFX10-NEXT:    s_movk_i32 s4, 0x7f
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 31, v1
-; GFX10-NEXT:    v_and_b32_e32 v19, s4, v8
-; GFX10-NEXT:    v_and_b32_e32 v18, s4, v9
+; GFX10-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX10-NEXT:    v_and_b32_e32 v18, 0x7f, v9
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v21, 64, v19
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v10, 64, v18
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v21, 64, v18
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v20, 64, v19
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v18, v[2:3]
 ; GFX10-NEXT:    v_lshrrev_b64 v[12:13], v19, v[4:5]
-; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[14:15], v18, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v21, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v19
+; GFX10-NEXT:    v_or_b32_e32 v12, v12, v16
 ; GFX10-NEXT:    v_or_b32_e32 v10, v10, v8
 ; GFX10-NEXT:    v_or_b32_e32 v11, v11, v9
-; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v20, v[6:7]
-; GFX10-NEXT:    v_or_b32_e32 v12, v12, v16
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v21, v[6:7]
 ; GFX10-NEXT:    v_or_b32_e32 v13, v13, v17
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v19
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v0, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v1, v11, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v19, v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v19
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v18
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v13, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0, v14, vcc_lo
@@ -5219,36 +5179,35 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX10-LABEL: v_fshr_i128_ssv:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v0
-; GFX10-NEXT:    s_movk_i32 s10, 0x7f
+; GFX10-NEXT:    v_and_b32_e32 v13, 0x7f, v0
 ; GFX10-NEXT:    s_mov_b32 s9, 0
-; GFX10-NEXT:    v_and_b32_e32 v13, s10, v0
 ; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT:    v_and_b32_e32 v12, s10, v1
 ; GFX10-NEXT:    s_lshr_b32 s8, s1, 31
-; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX10-NEXT:    v_and_b32_e32 v12, 0x7f, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
 ; GFX10-NEXT:    s_or_b64 s[8:9], s[2:3], s[8:9]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v14, 64, v13
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v12
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v12, s[8:9]
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v14, 64, v13
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v13, s[4:5]
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, s[6:7]
-; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v10, s[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
+; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v10, s[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v12, s[0:1]
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 64, v13
+; GFX10-NEXT:    v_or_b32_e32 v4, v4, v8
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v0
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v1
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v14, s[6:7]
-; GFX10-NEXT:    v_or_b32_e32 v4, v4, v8
 ; GFX10-NEXT:    v_or_b32_e32 v5, v5, v9
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v11, v3, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v13, s[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc_lo
@@ -6464,11 +6423,10 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_xor_b32_e32 v17, -1, v16
-; GFX10-NEXT:    s_movk_i32 s5, 0x7f
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT:    v_and_b32_e32 v26, s5, v16
+; GFX10-NEXT:    v_and_b32_e32 v26, 0x7f, v16
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX10-NEXT:    v_and_b32_e32 v25, s5, v17
+; GFX10-NEXT:    v_and_b32_e32 v25, 0x7f, v17
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 31, v1
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v27, 64, v26
@@ -6502,14 +6460,14 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s4
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v26, v[10:11]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v25, s5, v16
+; GFX10-NEXT:    v_and_b32_e32 v25, 0x7f, v16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 31, v5
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v0, v23, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v9, 64, v25
 ; GFX10-NEXT:    v_or_b32_e32 v6, v6, v8
-; GFX10-NEXT:    v_and_b32_e32 v23, s5, v20
+; GFX10-NEXT:    v_and_b32_e32 v23, 0x7f, v20
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, v3, s4
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v9, v[4:5]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index 4588b84e664aa..17fcb5488684c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -9,14 +9,13 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    s_and_b32 s1, s5, 1
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX9-NEXT:    s_and_b32 s3, s4, s2
-; GFX9-NEXT:    s_lshl_b32 s3, s3, s1
-; GFX9-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX9-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX9-NEXT:    s_or_b32 s0, s0, s3
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
@@ -27,14 +26,13 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX8-NEXT:    s_and_b32 s1, s5, 1
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX8-NEXT:    s_and_b32 s3, s4, s2
-; GFX8-NEXT:    s_lshl_b32 s3, s3, s1
-; GFX8-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX8-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX8-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX8-NEXT:    s_or_b32 s0, s0, s3
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
@@ -45,14 +43,13 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX7-NEXT:    s_and_b32 s1, s5, 1
-; GFX7-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX7-NEXT:    s_and_b32 s3, s4, s2
-; GFX7-NEXT:    s_lshl_b32 s3, s3, s1
-; GFX7-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX7-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX7-NEXT:    s_or_b32 s0, s0, s3
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s0
@@ -63,15 +60,14 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX10-NEXT:    s_and_b32 s1, s5, 1
-; GFX10-NEXT:    s_mov_b32 s2, 0xffff
+; GFX10-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX10-NEXT:    s_and_b32 s3, s4, s2
-; GFX10-NEXT:    s_lshl_b32 s2, s2, s1
-; GFX10-NEXT:    s_lshl_b32 s1, s3, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_lshl_b32 s3, 0xffff, s1
+; GFX10-NEXT:    s_lshl_b32 s1, s2, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_andn2_b32 s0, s0, s2
+; GFX10-NEXT:    s_andn2_b32 s0, s0, s3
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
@@ -87,13 +83,12 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v2, v[0:1], off
 ; GFX9-NEXT:    s_and_b32 s0, s3, 1
-; GFX9-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX9-NEXT:    s_and_b32 s2, s2, s1
-; GFX9-NEXT:    s_lshl_b32 s2, s2, s0
-; GFX9-NEXT:    s_lshl_b32 s0, s1, s0
+; GFX9-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s1, s1, s0
+; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    s_not_b32 s0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -104,36 +99,34 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i
 ; GFX8-LABEL: insertelement_v_v2i16_s_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_and_b32 s1, s3, 1
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX8-NEXT:    s_and_b32 s2, s2, s0
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s0, s3, 1
+; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s0
+; GFX8-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_not_b32 s0, s0
-; GFX8-NEXT:    s_lshl_b32 s2, s2, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v2, s0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, s1, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v2i16_s_s:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dword v0, v[0:1]
-; GFX7-NEXT:    s_and_b32 s1, s3, 1
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX7-NEXT:    s_and_b32 s2, s2, s0
-; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s0, s3, 1
+; GFX7-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX7-NEXT:    s_lshl_b32 s1, s1, s0
+; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX7-NEXT:    s_not_b32 s0, s0
-; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v2, s0, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, s1, v2
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -141,13 +134,12 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dword v2, v[0:1], off
 ; GFX10-NEXT:    s_and_b32 s0, s3, 1
-; GFX10-NEXT:    s_mov_b32 s1, 0xffff
+; GFX10-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX10-NEXT:    s_and_b32 s2, s2, s1
-; GFX10-NEXT:    s_lshl_b32 s1, s1, s0
-; GFX10-NEXT:    s_lshl_b32 s0, s2, s0
-; GFX10-NEXT:    s_not_b32 s1, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_lshl_b32 s2, 0xffff, s0
+; GFX10-NEXT:    s_lshl_b32 s0, s1, s0
+; GFX10-NEXT:    s_not_b32 s1, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_or_b32 v2, v2, s1, s0
@@ -165,9 +157,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    s_and_b32 s1, s4, 1
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v2, s2, v0
-; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_andn2_b32 s0, s0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
@@ -182,9 +173,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX8-NEXT:    s_and_b32 s1, s4, 1
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX8-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
@@ -198,11 +188,10 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX7-NEXT:    s_and_b32 s1, s4, 1
-; GFX7-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX7-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v0
-; GFX7-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
@@ -215,11 +204,10 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX10-NEXT:    s_and_b32 s1, s4, 1
-; GFX10-NEXT:    s_mov_b32 s2, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX10-NEXT:    v_and_b32_e32 v2, s2, v0
-; GFX10-NEXT:    s_lshl_b32 s2, s2, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_lshl_b32 s2, 0xffff, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_andn2_b32 s0, s0, s2
@@ -237,9 +225,9 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT:    s_and_b32 s2, s4, s1
+; GFX9-NEXT:    s_mov_b32 s1, 0xffff
+; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s2
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v0
@@ -254,9 +242,9 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT:    s_and_b32 s2, s4, s1
+; GFX8-NEXT:    s_mov_b32 s1, 0xffff
+; GFX8-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s2
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
 ; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
@@ -272,11 +260,10 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg %
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX7-NEXT:    s_and_b32 s2, s4, s1
-; GFX7-NEXT:    v_lshl_b32_e32 v2, s2, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v0, s1, v0
+; GFX7-NEXT:    s_and_b32 s1, s4, 0xffff
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
@@ -290,10 +277,9 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    s_mov_b32 s1, 0xffff
+; GFX10-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, s1
-; GFX10-NEXT:    s_and_b32 s1, s4, s1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
@@ -364,9 +350,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(<2 x i16> addrspace(4)* inreg %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -386,9 +371,9 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT:    s_and_b32 s1, s2, s0
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
 ; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v0
@@ -402,9 +387,9 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i
 ; GFX8-LABEL: insertelement_v_v2i16_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT:    s_and_b32 s1, s2, s0
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
@@ -420,12 +405,11 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i
 ; GFX7-LABEL: insertelement_v_v2i16_s_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dword v0, v[0:1]
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX7-NEXT:    s_and_b32 s1, s2, s0
+; GFX7-NEXT:    s_and_b32 s0, s2, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v1
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v3, v0, v1
@@ -439,10 +423,9 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v2
-; GFX10-NEXT:    s_mov_b32 s0, 0xffff
+; GFX10-NEXT:    s_and_b32 s0, s2, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, s0
-; GFX10-NEXT:    s_and_b32 s0, s2, s0
+; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s0
 ; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
@@ -463,9 +446,8 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX9-NEXT:    s_and_b32 s0, s2, 1
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX9-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    s_lshl_b32 s0, s1, s0
+; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    s_not_b32 s0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -477,11 +459,10 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i
 ; GFX8-LABEL: insertelement_v_v2i16_v_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_and_b32 s1, s2, 1
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_and_b32 s0, s2, 1
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_not_b32 s0, s0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -495,13 +476,12 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i
 ; GFX7-LABEL: insertelement_v_v2i16_v_s:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dword v0, v[0:1]
-; GFX7-NEXT:    s_and_b32 s1, s2, 1
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX7-NEXT:    v_and_b32_e32 v1, s0, v2
-; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s0, s2, 1
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s0, v1
+; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX7-NEXT:    s_not_b32 s0, s0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
@@ -514,12 +494,11 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    s_and_b32 s0, s2, 1
-; GFX10-NEXT:    s_mov_b32 s1, 0xffff
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    s_lshl_b32 s0, s1, s0
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX10-NEXT:    s_not_b32 s0, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_or_b32 v2, v3, s0, v2
@@ -587,9 +566,8 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
-; GFX10-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, s0
+; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
@@ -665,22 +643,21 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX9-LABEL: insertelement_v_v4i16_s_s:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s1, s3, 1
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
-; GFX9-NEXT:    s_and_b32 s2, s2, s0
-; GFX9-NEXT:    s_lshl_b32 s3, s3, 4
-; GFX9-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
-; GFX9-NEXT:    s_not_b32 s0, s0
+; GFX9-NEXT:    s_and_b32 s1, s3, 1
+; GFX9-NEXT:    s_lshr_b32 s0, s3, 1
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX9-NEXT:    s_lshl_b32 s1, 0xffff, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX9-NEXT:    s_not_b32 s1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v4, v5, s0, v4
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX9-NEXT:    v_and_or_b32 v4, v5, s1, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -689,22 +666,21 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX8-LABEL: insertelement_v_v4i16_s_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    s_lshr_b32 s1, s3, 1
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 4
-; GFX8-NEXT:    s_and_b32 s2, s2, s0
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s3
-; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
-; GFX8-NEXT:    s_not_b32 s0, s0
-; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, s3, 1
+; GFX8-NEXT:    s_lshr_b32 s0, s3, 1
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX8-NEXT:    s_lshl_b32 s1, 0xffff, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX8-NEXT:    s_not_b32 s1, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v4, s0, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, s1, v4
 ; GFX8-NEXT:    v_or_b32_e32 v4, s2, v4
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -713,22 +689,21 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX7-LABEL: insertelement_v_v4i16_s_s:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    s_lshr_b32 s1, s3, 1
-; GFX7-NEXT:    s_and_b32 s3, s3, 1
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
-; GFX7-NEXT:    s_lshl_b32 s3, s3, 4
-; GFX7-NEXT:    s_and_b32 s2, s2, s0
-; GFX7-NEXT:    s_lshl_b32 s0, s0, s3
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
-; GFX7-NEXT:    s_not_b32 s0, s0
-; GFX7-NEXT:    s_lshl_b32 s2, s2, s3
+; GFX7-NEXT:    s_and_b32 s1, s3, 1
+; GFX7-NEXT:    s_lshr_b32 s0, s3, 1
+; GFX7-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX7-NEXT:    s_not_b32 s1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v4, s0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s1, v4
 ; GFX7-NEXT:    v_or_b32_e32 v4, s2, v4
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -737,23 +712,22 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX10-LABEL: insertelement_v_v4i16_s_s:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    s_lshr_b32 s1, s3, 1
-; GFX10-NEXT:    s_and_b32 s3, s3, 1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s1, 1
-; GFX10-NEXT:    s_mov_b32 s0, 0xffff
-; GFX10-NEXT:    s_lshl_b32 s3, s3, 4
-; GFX10-NEXT:    s_and_b32 s2, s2, s0
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
-; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX10-NEXT:    s_not_b32 s0, s0
+; GFX10-NEXT:    s_lshr_b32 s0, s3, 1
+; GFX10-NEXT:    s_and_b32 s1, s3, 1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX10-NEXT:    s_lshl_b32 s3, 0xffff, s1
+; GFX10-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX10-NEXT:    s_not_b32 s2, s3
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s0, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v4, v2, s0, s2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s1, 0
+; GFX10-NEXT:    v_and_or_b32 v4, v2, s2, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <4 x i16>, <4 x i16> addrspace(1 )* %ptr
@@ -768,18 +742,17 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX9-NEXT:    s_lshr_b32 s2, s4, 1
 ; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
-; GFX9-NEXT:    s_mov_b32 s5, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_cselect_b32 s3, s1, s0
 ; GFX9-NEXT:    s_and_b32 s4, s4, 1
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 4
-; GFX9-NEXT:    s_lshl_b32 s5, s5, s4
+; GFX9-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX9-NEXT:    s_andn2_b32 s3, s3, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -794,21 +767,20 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX8-NEXT:    s_lshr_b32 s2, s4, 1
 ; GFX8-NEXT:    s_cmp_eq_u32 s2, 1
-; GFX8-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_cselect_b32 s3, s1, s0
 ; GFX8-NEXT:    s_and_b32 s4, s4, 1
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    s_andn2_b32 s3, s3, s4
 ; GFX8-NEXT:    v_or_b32_e32 v4, s3, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
@@ -820,18 +792,17 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX7-NEXT:    s_lshr_b32 s2, s4, 1
 ; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
-; GFX7-NEXT:    s_mov_b32 s5, 0xffff
-; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_cselect_b32 s3, s1, s0
 ; GFX7-NEXT:    s_and_b32 s4, s4, 1
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
-; GFX7-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s4
 ; GFX7-NEXT:    s_andn2_b32 s3, s3, s4
 ; GFX7-NEXT:    v_or_b32_e32 v4, s3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
@@ -845,9 +816,8 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10-NEXT:    s_lshr_b32 s2, s4, 1
-; GFX10-NEXT:    s_mov_b32 s5, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX10-NEXT:    s_cmp_eq_u32 s2, 1
-; GFX10-NEXT:    v_and_b32_e32 v2, s5, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cselect_b32 s3, s1, s0
@@ -855,7 +825,7 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    s_lshl_b32 s5, s5, s4
+; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_andn2_b32 s3, s3, s5
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v2, s4, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
@@ -877,13 +847,13 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT:    s_and_b32 s3, s4, s2
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s3, s4, 0xffff
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v0, s3
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
@@ -904,13 +874,13 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT:    s_and_b32 s3, s4, s2
+; GFX8-NEXT:    s_mov_b32 s2, 0xffff
+; GFX8-NEXT:    s_and_b32 s3, s4, 0xffff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v0, s3
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
@@ -932,16 +902,15 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX7-NEXT:    s_and_b32 s3, s4, s2
+; GFX7-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX7-NEXT:    v_lshl_b32_e32 v3, s3, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v0, s2, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v3, s2, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_or_b32_e32 v4, v0, v3
@@ -960,12 +929,11 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg %
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
-; GFX10-NEXT:    s_mov_b32 s2, 0xffff
-; GFX10-NEXT:    s_and_b32 s3, s4, s2
+; GFX10-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, s2
-; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s3
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s2
 ; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s1
@@ -1073,10 +1041,9 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg %
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
-; GFX10-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v2, s2
+; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v2, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1102,10 +1069,10 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i
 ; GFX9-LABEL: insertelement_v_v4i16_s_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT:    s_and_b32 s1, s2, s0
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v6, v2, s1
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
@@ -1125,10 +1092,10 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i
 ; GFX8-LABEL: insertelement_v_v4i16_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    s_and_b32 s1, s2, s0
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v6, v2, s1
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
@@ -1149,13 +1116,12 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i
 ; GFX7-LABEL: insertelement_v_v4i16_s_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX7-NEXT:    s_and_b32 s1, s2, s0
+; GFX7-NEXT:    s_and_b32 s0, s2, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v6, s1, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v6, s0, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
 ; GFX7-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v3, 0
@@ -1175,11 +1141,10 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    v_and_b32_e32 v3, 1, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
-; GFX10-NEXT:    s_mov_b32 s0, 0xffff
+; GFX10-NEXT:    s_and_b32 s0, s2, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
-; GFX10-NEXT:    s_and_b32 s0, s2, s0
+; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v3, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v5
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v4
@@ -1202,20 +1167,19 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX9-LABEL: insertelement_v_v4i16_v_s:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_lshr_b32 s1, s2, 1
-; GFX9-NEXT:    s_and_b32 s2, s2, 1
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 4
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    s_not_b32 s0, s0
+; GFX9-NEXT:    s_and_b32 s1, s2, 1
+; GFX9-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    s_lshl_b32 s1, 0xffff, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX9-NEXT:    s_not_b32 s1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v2, v5, s0, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX9-NEXT:    v_and_or_b32 v2, v5, s1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
@@ -1224,22 +1188,21 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX8-LABEL: insertelement_v_v4i16_v_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    s_lshr_b32 s1, s2, 1
-; GFX8-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 4
-; GFX8-NEXT:    v_mov_b32_e32 v5, s2
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX8-NEXT:    s_and_b32 s1, s2, 1
+; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    s_lshl_b32 s1, 0xffff, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    s_not_b32 s0, s0
+; GFX8-NEXT:    s_not_b32 s1, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v5, s0, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, s1, v5
 ; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
@@ -1248,22 +1211,21 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX7-LABEL: insertelement_v_v4i16_v_s:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    s_lshr_b32 s1, s2, 1
-; GFX7-NEXT:    s_and_b32 s2, s2, 1
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
-; GFX7-NEXT:    s_lshl_b32 s2, s2, 4
-; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
-; GFX7-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
-; GFX7-NEXT:    s_not_b32 s0, s0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s2, v2
+; GFX7-NEXT:    s_and_b32 s1, s2, 1
+; GFX7-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v2
+; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX7-NEXT:    s_not_b32 s1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v5, s0, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, s1, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX7-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
@@ -1272,18 +1234,17 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX10-LABEL: insertelement_v_v4i16_v_s:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_lshr_b32 s1, s2, 1
 ; GFX10-NEXT:    s_and_b32 s0, s2, 1
-; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s1, 1
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
-; GFX10-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    s_lshl_b32 s0, s1, s0
+; GFX10-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX10-NEXT:    s_not_b32 s0, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_and_or_b32 v4, v3, s0, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s2, 0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -1371,12 +1332,11 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(<4 x i16> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    v_and_b32_e32 v4, 1, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
-; GFX10-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v6
+; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
@@ -1399,8 +1359,8 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GFX9-NEXT:    s_lshr_b32 s6, s5, 1
 ; GFX9-NEXT:    s_cmp_eq_u32 s6, 1
-; GFX9-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_cselect_b32 s7, s1, s0
 ; GFX9-NEXT:    s_cmp_eq_u32 s6, 2
@@ -1409,9 +1369,9 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_cselect_b32 s7, s3, s7
 ; GFX9-NEXT:    s_and_b32 s5, s5, 1
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 4
-; GFX9-NEXT:    s_and_b32 s4, s4, s8
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, s5
-; GFX9-NEXT:    s_lshl_b32 s5, s8, s5
+; GFX9-NEXT:    s_lshl_b32 s5, 0xffff, s5
 ; GFX9-NEXT:    s_andn2_b32 s5, s7, s5
 ; GFX9-NEXT:    s_or_b32 s4, s5, s4
 ; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
@@ -1423,7 +1383,6 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
@@ -1435,8 +1394,8 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GFX8-NEXT:    s_lshr_b32 s6, s5, 1
 ; GFX8-NEXT:    s_cmp_eq_u32 s6, 1
-; GFX8-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_cselect_b32 s7, s1, s0
 ; GFX8-NEXT:    s_cmp_eq_u32 s6, 2
@@ -1445,9 +1404,9 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_cselect_b32 s7, s3, s7
 ; GFX8-NEXT:    s_and_b32 s5, s5, 1
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 4
-; GFX8-NEXT:    s_and_b32 s4, s4, s8
+; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, s5
-; GFX8-NEXT:    s_lshl_b32 s5, s8, s5
+; GFX8-NEXT:    s_lshl_b32 s5, 0xffff, s5
 ; GFX8-NEXT:    s_andn2_b32 s5, s7, s5
 ; GFX8-NEXT:    s_or_b32 s4, s5, s4
 ; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
@@ -1459,7 +1418,6 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX8-NEXT:    s_cselect_b32 s3, s4, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
@@ -1471,7 +1429,6 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GFX7-NEXT:    s_lshr_b32 s6, s5, 1
 ; GFX7-NEXT:    s_cmp_eq_u32 s6, 1
-; GFX7-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_cselect_b32 s7, s1, s0
 ; GFX7-NEXT:    s_cmp_eq_u32 s6, 2
@@ -1480,9 +1437,9 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_cselect_b32 s7, s3, s7
 ; GFX7-NEXT:    s_and_b32 s5, s5, 1
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 4
-; GFX7-NEXT:    s_and_b32 s4, s4, s8
+; GFX7-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, s5
-; GFX7-NEXT:    s_lshl_b32 s5, s8, s5
+; GFX7-NEXT:    s_lshl_b32 s5, 0xffff, s5
 ; GFX7-NEXT:    s_andn2_b32 s5, s7, s5
 ; GFX7-NEXT:    s_or_b32 s4, s5, s4
 ; GFX7-NEXT:    s_cmp_eq_u32 s6, 0
@@ -1507,9 +1464,8 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GFX10-NEXT:    s_lshr_b32 s6, s5, 1
-; GFX10-NEXT:    s_mov_b32 s8, 0xffff
-; GFX10-NEXT:    s_cmp_eq_u32 s6, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cselect_b32 s7, s1, s0
@@ -1518,9 +1474,9 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %
 ; GFX10-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX10-NEXT:    s_cselect_b32 s7, s3, s7
 ; GFX10-NEXT:    s_and_b32 s5, s5, 1
-; GFX10-NEXT:    s_and_b32 s4, s4, s8
+; GFX10-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 4
-; GFX10-NEXT:    s_lshl_b32 s8, s8, s5
+; GFX10-NEXT:    s_lshl_b32 s8, 0xffff, s5
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, s5
 ; GFX10-NEXT:    s_andn2_b32 s5, s7, s8
 ; GFX10-NEXT:    s_or_b32 s4, s5, s4
@@ -1548,17 +1504,16 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX9-LABEL: insertelement_v_v8i16_s_s:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:    s_and_b32 s1, s3, 1
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s0, s3, 1
 ; GFX9-NEXT:    s_lshr_b32 s4, s3, 1
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX9-NEXT:    s_and_b32 s2, s2, s0
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX9-NEXT:    s_lshl_b32 s1, s1, s0
+; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
 ; GFX9-NEXT:    s_not_b32 s5, s0
+; GFX9-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
-; GFX9-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
@@ -1578,14 +1533,13 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX8-LABEL: insertelement_v_v8i16_s_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT:    s_and_b32 s1, s3, 1
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s0, s3, 1
 ; GFX8-NEXT:    s_lshr_b32 s4, s3, 1
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX8-NEXT:    s_and_b32 s2, s2, s0
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX8-NEXT:    s_lshl_b32 s5, s1, s0
+; GFX8-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT:    s_lshl_b32 s5, s2, s1
 ; GFX8-NEXT:    s_not_b32 s6, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
@@ -1611,14 +1565,13 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_and_b32 s1, s3, 1
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_and_b32 s0, s3, 1
 ; GFX7-NEXT:    s_lshr_b32 s4, s3, 1
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX7-NEXT:    s_and_b32 s2, s2, s0
-; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX7-NEXT:    s_lshl_b32 s5, s1, s0
+; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT:    s_lshl_b32 s5, s2, s1
 ; GFX7-NEXT:    s_not_b32 s6, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
@@ -1646,9 +1599,8 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s4, 2
 ; GFX10-NEXT:    s_lshl_b32 s3, s1, 4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s4, 3
-; GFX10-NEXT:    s_mov_b32 s5, 0xffff
-; GFX10-NEXT:    s_and_b32 s2, s2, s5
-; GFX10-NEXT:    s_lshl_b32 s5, s5, s3
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s3
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
 ; GFX10-NEXT:    s_not_b32 s3, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -1677,8 +1629,8 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GFX9-NEXT:    s_lshr_b32 s5, s4, 1
 ; GFX9-NEXT:    s_cmp_eq_u32 s5, 1
-; GFX9-NEXT:    s_mov_b32 s7, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v0, s7, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_cselect_b32 s6, s1, s0
 ; GFX9-NEXT:    s_cmp_eq_u32 s5, 2
@@ -1687,12 +1639,11 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_cselect_b32 s6, s3, s6
 ; GFX9-NEXT:    s_and_b32 s4, s4, 1
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 4
-; GFX9-NEXT:    s_lshl_b32 s7, s7, s4
+; GFX9-NEXT:    s_lshl_b32 s7, 0xffff, s4
 ; GFX9-NEXT:    s_andn2_b32 s6, s6, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    v_lshl_or_b32 v6, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
@@ -1713,8 +1664,8 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GFX8-NEXT:    s_lshr_b32 s5, s4, 1
 ; GFX8-NEXT:    s_cmp_eq_u32 s5, 1
-; GFX8-NEXT:    s_mov_b32 s7, 0xffff
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_cselect_b32 s6, s1, s0
 ; GFX8-NEXT:    s_cmp_eq_u32 s5, 2
@@ -1724,7 +1675,7 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s4, s4, 1
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    s_lshl_b32 s4, s7, s4
+; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    s_andn2_b32 s4, s6, s4
 ; GFX8-NEXT:    v_or_b32_e32 v6, s4, v0
@@ -1737,7 +1688,6 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
@@ -1749,8 +1699,8 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GFX7-NEXT:    s_lshr_b32 s5, s4, 1
 ; GFX7-NEXT:    s_cmp_eq_u32 s5, 1
-; GFX7-NEXT:    s_mov_b32 s7, 0xffff
-; GFX7-NEXT:    v_and_b32_e32 v0, s7, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_cselect_b32 s6, s1, s0
 ; GFX7-NEXT:    s_cmp_eq_u32 s5, 2
@@ -1760,11 +1710,10 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s4, s4, 1
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
-; GFX7-NEXT:    s_lshl_b32 s4, s7, s4
+; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s4
 ; GFX7-NEXT:    s_andn2_b32 s4, s6, s4
 ; GFX7-NEXT:    v_or_b32_e32 v4, s4, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
@@ -1785,9 +1734,8 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GFX10-NEXT:    s_lshr_b32 s5, s4, 1
-; GFX10-NEXT:    s_mov_b32 s7, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GFX10-NEXT:    s_cmp_eq_u32 s5, 1
-; GFX10-NEXT:    v_and_b32_e32 v4, s7, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cselect_b32 s6, s1, s0
@@ -1801,7 +1749,7 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 4
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-NEXT:    s_lshl_b32 s7, s7, s4
+; GFX10-NEXT:    s_lshl_b32 s7, 0xffff, s4
 ; GFX10-NEXT:    s_andn2_b32 s6, s6, s7
 ; GFX10-NEXT:    v_lshl_or_b32 v6, v4, s4, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
@@ -1828,15 +1776,15 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT:    s_and_b32 s4, s4, s5
+; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
@@ -1865,15 +1813,15 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_mov_b32 s5, 0xffff
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT:    s_and_b32 s4, s4, s5
+; GFX8-NEXT:    s_mov_b32 s5, 0xffff
+; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
@@ -1903,20 +1851,19 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    s_mov_b32 s5, 0xffff
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX7-NEXT:    s_and_b32 s4, s4, s5
+; GFX7-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
@@ -1939,15 +1886,14 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX10-LABEL: insertelement_s_v8i16_s_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
-; GFX10-NEXT:    v_and_b32_e32 v1, 1, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v0
-; GFX10-NEXT:    s_mov_b32 s0, 0xffff
-; GFX10-NEXT:    s_and_b32 s1, s4, s0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v0
+; GFX10-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v6
-; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, s0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v6
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v1, s1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v6
 ; GFX10-NEXT:    v_xor_b32_e32 v5, -1, v2
@@ -2091,15 +2037,15 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %
 ; GFX10-LABEL: insertelement_s_v8i16_v_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX10-NEXT:    v_and_b32_e32 v2, 1, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v1
-; GFX10-NEXT:    s_mov_b32 s0, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 1, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v6
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v6
+; GFX10-NEXT:    s_mov_b32 null, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v6
-; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v2, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v6
+; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v2, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_xor_b32_e32 v5, -1, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2130,10 +2076,10 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX9-LABEL: insertelement_v_v8i16_s_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT:    s_and_b32 s1, s2, s0
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
@@ -2159,10 +2105,10 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX8-LABEL: insertelement_v_v8i16_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT:    s_and_b32 s1, s2, s0
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
@@ -2192,15 +2138,14 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX7-NEXT:    s_and_b32 s1, s2, s0
+; GFX7-NEXT:    s_and_b32 s0, s2, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v1
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
@@ -2221,15 +2166,14 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX10-LABEL: insertelement_v_v8i16_s_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
-; GFX10-NEXT:    s_mov_b32 s0, 0xffff
-; GFX10-NEXT:    s_and_b32 s1, s2, s0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v2
+; GFX10-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v1
-; GFX10-NEXT:    v_lshlrev_b32_e64 v7, v0, s0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v7, v0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v7
@@ -2256,13 +2200,12 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX9-LABEL: insertelement_v_v8i16_v_s:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT:    s_and_b32 s1, s2, 1
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s0, s2, 1
 ; GFX9-NEXT:    s_lshr_b32 s4, s2, 1
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    s_not_b32 s5, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
@@ -2284,13 +2227,12 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX8-LABEL: insertelement_v_v8i16_v_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT:    s_and_b32 s1, s2, 1
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s0, s2, 1
 ; GFX8-NEXT:    s_lshr_b32 s4, s2, 1
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX8-NEXT:    s_not_b32 s5, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
@@ -2317,14 +2259,13 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_and_b32 s1, s2, 1
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_and_b32 s0, s2, 1
 ; GFX7-NEXT:    s_lshr_b32 s4, s2, 1
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX7-NEXT:    v_and_b32_e32 v0, s0, v2
-; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
+; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
 ; GFX7-NEXT:    s_not_b32 s5, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
@@ -2352,9 +2293,8 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, 2
 ; GFX10-NEXT:    s_lshl_b32 s2, s1, 4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s3, 3
-; GFX10-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    s_lshl_b32 s2, s4, s2
+; GFX10-NEXT:    s_lshl_b32 s2, 0xffff, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX10-NEXT:    s_not_b32 s2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
@@ -2469,15 +2409,14 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX10-LABEL: insertelement_v_v8i16_v_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
-; GFX10-NEXT:    s_mov_b32 s0, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v1
-; GFX10-NEXT:    v_lshlrev_b32_e64 v8, v0, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v8, v0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
@@ -2505,8 +2444,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GFX9-NEXT:    s_lshr_b32 s7, s5, 1
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_cselect_b32 s0, s9, s8
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 2
@@ -2523,11 +2462,11 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    s_cselect_b32 s0, s15, s0
 ; GFX9-NEXT:    s_and_b32 s1, s5, 1
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX9-NEXT:    s_and_b32 s3, s4, s2
-; GFX9-NEXT:    s_lshl_b32 s3, s3, s1
-; GFX9-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX9-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX9-NEXT:    s_or_b32 s16, s0, s3
+; GFX9-NEXT:    s_or_b32 s16, s0, s2
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 0
 ; GFX9-NEXT:    s_cselect_b32 s0, s16, s8
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
@@ -2544,7 +2483,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_cselect_b32 s6, s16, s14
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
@@ -2564,8 +2502,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GFX8-NEXT:    s_lshr_b32 s7, s5, 1
 ; GFX8-NEXT:    s_cmp_eq_u32 s7, 1
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_cselect_b32 s0, s9, s8
 ; GFX8-NEXT:    s_cmp_eq_u32 s7, 2
@@ -2582,11 +2520,11 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    s_cselect_b32 s0, s15, s0
 ; GFX8-NEXT:    s_and_b32 s1, s5, 1
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX8-NEXT:    s_and_b32 s3, s4, s2
-; GFX8-NEXT:    s_lshl_b32 s3, s3, s1
-; GFX8-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX8-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX8-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX8-NEXT:    s_or_b32 s16, s0, s3
+; GFX8-NEXT:    s_or_b32 s16, s0, s2
 ; GFX8-NEXT:    s_cmp_eq_u32 s7, 0
 ; GFX8-NEXT:    s_cselect_b32 s0, s16, s8
 ; GFX8-NEXT:    s_cmp_eq_u32 s7, 1
@@ -2603,7 +2541,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_cselect_b32 s6, s16, s14
 ; GFX8-NEXT:    s_cmp_eq_u32 s7, 7
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
@@ -2623,7 +2560,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX7-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GFX7-NEXT:    s_lshr_b32 s7, s5, 1
 ; GFX7-NEXT:    s_cmp_eq_u32 s7, 1
-; GFX7-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_cselect_b32 s0, s9, s8
 ; GFX7-NEXT:    s_cmp_eq_u32 s7, 2
@@ -2640,11 +2576,11 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX7-NEXT:    s_cselect_b32 s0, s15, s0
 ; GFX7-NEXT:    s_and_b32 s1, s5, 1
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX7-NEXT:    s_and_b32 s3, s4, s2
-; GFX7-NEXT:    s_lshl_b32 s3, s3, s1
-; GFX7-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX7-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX7-NEXT:    s_or_b32 s16, s0, s3
+; GFX7-NEXT:    s_or_b32 s16, s0, s2
 ; GFX7-NEXT:    s_cmp_eq_u32 s7, 0
 ; GFX7-NEXT:    s_cselect_b32 s0, s16, s8
 ; GFX7-NEXT:    s_cmp_eq_u32 s7, 1
@@ -2681,9 +2617,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GFX10-NEXT:    s_lshr_b32 s7, s5, 1
-; GFX10-NEXT:    s_mov_b32 s2, 0xffff
-; GFX10-NEXT:    s_cmp_eq_u32 s7, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s7, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0
@@ -2702,11 +2637,11 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX10-NEXT:    s_cmp_eq_u32 s7, 7
 ; GFX10-NEXT:    s_cselect_b32 s0, s15, s0
 ; GFX10-NEXT:    s_and_b32 s1, s5, 1
-; GFX10-NEXT:    s_and_b32 s3, s4, s2
+; GFX10-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX10-NEXT:    s_lshl_b32 s2, s2, s1
-; GFX10-NEXT:    s_lshl_b32 s1, s3, s1
-; GFX10-NEXT:    s_andn2_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshl_b32 s3, 0xffff, s1
+; GFX10-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX10-NEXT:    s_andn2_b32 s0, s0, s3
 ; GFX10-NEXT:    s_or_b32 s16, s0, s1
 ; GFX10-NEXT:    s_cmp_eq_u32 s7, 0
 ; GFX10-NEXT:    s_cselect_b32 s0, s16, s8
@@ -2746,17 +2681,16 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
-; GFX9-NEXT:    s_and_b32 s1, s3, 1
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s0, s3, 1
 ; GFX9-NEXT:    s_lshr_b32 s12, s3, 1
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX9-NEXT:    s_and_b32 s2, s2, s0
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX9-NEXT:    s_lshl_b32 s1, s1, s0
+; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
-; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
 ; GFX9-NEXT:    s_not_b32 s13, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
@@ -2795,14 +2729,13 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
-; GFX8-NEXT:    s_and_b32 s1, s3, 1
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s0, s3, 1
 ; GFX8-NEXT:    s_lshr_b32 s12, s3, 1
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX8-NEXT:    s_and_b32 s2, s2, s0
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX8-NEXT:    s_lshl_b32 s13, s1, s0
+; GFX8-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
-; GFX8-NEXT:    s_lshl_b32 s13, s2, s1
 ; GFX8-NEXT:    s_not_b32 s14, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
@@ -2845,14 +2778,13 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX7-NEXT:    s_mov_b64 s[16:17], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[16:19], 0 addr64
 ; GFX7-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[16:19], 0 addr64 offset:16
-; GFX7-NEXT:    s_and_b32 s1, s3, 1
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_and_b32 s0, s3, 1
 ; GFX7-NEXT:    s_lshr_b32 s12, s3, 1
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX7-NEXT:    s_and_b32 s2, s2, s0
-; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX7-NEXT:    s_lshl_b32 s13, s1, s0
+; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
-; GFX7-NEXT:    s_lshl_b32 s13, s2, s1
 ; GFX7-NEXT:    s_not_b32 s14, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
@@ -2892,22 +2824,21 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
 ; GFX10-NEXT:    s_lshr_b32 s7, s3, 1
-; GFX10-NEXT:    s_mov_b32 s8, 0xffff
+; GFX10-NEXT:    s_and_b32 s8, s2, 0xffff
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s7, 1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, 2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s7, 3
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, s7, 4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, s7, 5
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, s7, 6
-; GFX10-NEXT:    s_and_b32 s9, s2, s8
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s7, 7
 ; GFX10-NEXT:    s_and_b32 s3, s3, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 4
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0
-; GFX10-NEXT:    s_lshl_b32 s8, s8, s3
-; GFX10-NEXT:    s_lshl_b32 s3, s9, s3
-; GFX10-NEXT:    s_not_b32 s8, s8
+; GFX10-NEXT:    s_lshl_b32 s9, 0xffff, s3
+; GFX10-NEXT:    s_lshl_b32 s3, s8, s3
+; GFX10-NEXT:    s_not_b32 s8, s9
 ; GFX10-NEXT:    v_mov_b32_e32 v12, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
@@ -2944,8 +2875,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GFX9-NEXT:    s_lshr_b32 s2, s4, 1
 ; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
-; GFX9-NEXT:    s_mov_b32 s3, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_cselect_b32 s0, s9, s8
 ; GFX9-NEXT:    s_cmp_eq_u32 s2, 2
@@ -2962,12 +2893,11 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    s_cselect_b32 s0, s15, s0
 ; GFX9-NEXT:    s_and_b32 s1, s4, 1
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX9-NEXT:    s_lshl_b32 s3, s3, s1
+; GFX9-NEXT:    s_lshl_b32 s3, 0xffff, s1
 ; GFX9-NEXT:    s_andn2_b32 s0, s0, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_lshl_or_b32 v8, v0, s1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
@@ -3003,8 +2933,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GFX8-NEXT:    s_lshr_b32 s2, s4, 1
 ; GFX8-NEXT:    s_cmp_eq_u32 s2, 1
-; GFX8-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_cselect_b32 s0, s9, s8
 ; GFX8-NEXT:    s_cmp_eq_u32 s2, 2
@@ -3022,7 +2952,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    s_and_b32 s1, s4, 1
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX8-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX8-NEXT:    v_or_b32_e32 v8, s0, v0
@@ -3051,7 +2981,6 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0
-; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
@@ -3062,8 +2991,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX7-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GFX7-NEXT:    s_lshr_b32 s2, s4, 1
 ; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
-; GFX7-NEXT:    s_mov_b32 s3, 0xffff
-; GFX7-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_cselect_b32 s0, s9, s8
 ; GFX7-NEXT:    s_cmp_eq_u32 s2, 2
@@ -3081,11 +3010,10 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX7-NEXT:    s_and_b32 s1, s4, 1
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
-; GFX7-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX7-NEXT:    v_or_b32_e32 v8, s0, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
@@ -3120,9 +3048,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GFX10-NEXT:    s_lshr_b32 s0, s4, 1
-; GFX10-NEXT:    s_mov_b32 s3, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff, v0
 ; GFX10-NEXT:    s_cmp_eq_u32 s0, 1
-; GFX10-NEXT:    v_and_b32_e32 v8, s3, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0
@@ -3150,7 +3077,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s14
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 4
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s15
-; GFX10-NEXT:    s_lshl_b32 s3, s3, s2
+; GFX10-NEXT:    s_lshl_b32 s3, 0xffff, s2
 ; GFX10-NEXT:    s_andn2_b32 s1, s1, s3
 ; GFX10-NEXT:    v_lshl_or_b32 v12, v8, s2, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
@@ -3201,12 +3128,12 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s22
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT:    s_and_b32 s4, s4, s5
+; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s23
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
@@ -3261,12 +3188,12 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX8-NEXT:    v_mov_b32_e32 v7, s22
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT:    s_and_b32 s4, s4, s5
+; GFX8-NEXT:    s_mov_b32 s5, 0xffff
+; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s23
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
@@ -3322,17 +3249,16 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s22
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX7-NEXT:    s_and_b32 s4, s4, s5
+; GFX7-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX7-NEXT:    v_mov_b32_e32 v9, s23
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
@@ -3367,8 +3293,7 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    s_mov_b32 s5, 0xffff
-; GFX10-NEXT:    s_and_b32 s6, s4, s5
+; GFX10-NEXT:    s_and_b32 s5, s4, 0xffff
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v12
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v12
@@ -3376,10 +3301,10 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 5, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v12
-; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s5
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v12
-; GFX10-NEXT:    v_lshlrev_b32_e64 v8, v0, s6
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v12
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX10-NEXT:    v_lshlrev_b32_e64 v8, v0, s5
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v12
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v2
@@ -3606,21 +3531,21 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 1, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v12
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v12
+; GFX10-NEXT:    s_mov_b32 null, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 4, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 5, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v12
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v12
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v12
-; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v12
+; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, s8, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s10, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s11, s1
@@ -3663,10 +3588,10 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT:    s_and_b32 s1, s2, s0
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
@@ -3711,10 +3636,10 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx4 v[7:10], v[0:1]
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT:    s_and_b32 s1, s2, s0
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
@@ -3761,19 +3686,18 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX7-NEXT:    s_mov_b64 s[16:17], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64
 ; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX7-NEXT:    s_and_b32 s1, s2, s0
+; GFX7-NEXT:    s_and_b32 s0, s2, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v1
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
@@ -3809,9 +3733,9 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT:    s_mov_b32 s5, 0xffff
+; GFX10-NEXT:    s_and_b32 s5, s2, 0xffff
 ; GFX10-NEXT:    v_mov_b32_e32 v13, 16
-; GFX10-NEXT:    s_and_b32 s6, s2, s5
+; GFX10-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v0
@@ -3819,11 +3743,10 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 5, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 6, v0
-; GFX10-NEXT:    v_mov_b32_e32 v14, 0
-; GFX10-NEXT:    v_lshlrev_b32_e64 v11, v2, s5
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v0
-; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, s6
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e64 v11, v2, 0xffff
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, s5
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v11
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -3859,13 +3782,12 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
-; GFX9-NEXT:    s_and_b32 s1, s2, 1
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s0, s2, 1
 ; GFX9-NEXT:    s_lshr_b32 s12, s2, 1
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    s_not_b32 s13, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
@@ -3906,13 +3828,12 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx4 v[7:10], v[0:1]
-; GFX8-NEXT:    s_and_b32 s1, s2, 1
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s0, s2, 1
 ; GFX8-NEXT:    s_lshr_b32 s12, s2, 1
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX8-NEXT:    s_not_b32 s13, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
@@ -3956,14 +3877,13 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX7-NEXT:    s_mov_b64 s[16:17], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64
 ; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16
-; GFX7-NEXT:    s_and_b32 s1, s2, 1
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_and_b32 s0, s2, 1
 ; GFX7-NEXT:    s_lshr_b32 s12, s2, 1
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX7-NEXT:    v_and_b32_e32 v0, s0, v2
-; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
+; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
 ; GFX7-NEXT:    s_not_b32 s13, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
@@ -4012,9 +3932,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s6, 6
 ; GFX10-NEXT:    s_lshl_b32 s7, s5, 4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, s6, 7
-; GFX10-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    s_lshl_b32 s7, s8, s7
+; GFX10-NEXT:    s_lshl_b32 s7, 0xffff, s7
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, s6, 0
 ; GFX10-NEXT:    s_not_b32 s7, s7
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0
@@ -4197,19 +4116,18 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
 ; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX10-NEXT:    v_mov_b32_e32 v14, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v15, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 4, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 5, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e64 v12, v3, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v0
+; GFX10-NEXT:    v_lshlrev_b32_e64 v12, v3, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v12
 ; GFX10-NEXT:    v_mov_b32_e32 v12, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index 21457c369b3cc..3e6bc4e425246 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -722,33 +722,34 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    s_and_b32 s3, s3, 3
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    s_and_b32 s2, s2, s4
+; GFX9-NEXT:    s_movk_i32 s4, 0xff
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 3
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX9-NEXT:    s_lshl_b32 s3, s4, s3
+; GFX9-NEXT:    s_lshl_b32 s3, 0xff, s3
 ; GFX9-NEXT:    s_not_b32 s3, s3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, 8
-; GFX9-NEXT:    v_mov_b32_e32 v2, 16
+; GFX9-NEXT:    v_mov_b32_e32 v4, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX9-NEXT:    v_or3_b32 v0, v0, v7, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v5
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v2, v0, v1, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX9-NEXT:    v_or3_b32 v0, v0, v6, v4
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v4, v0, s4, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_or3_b32 v2, v4, v2, v3
+; GFX9-NEXT:    v_or3_b32 v2, v2, v3, v4
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -757,13 +758,12 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 16
-; GFX8-NEXT:    s_and_b32 s1, s3, 3
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX8-NEXT:    s_and_b32 s2, s2, s0
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s0, s3, 3
+; GFX8-NEXT:    s_and_b32 s1, s2, 0xff
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s0
+; GFX8-NEXT:    s_lshl_b32 s0, 0xff, s0
 ; GFX8-NEXT:    s_not_b32 s0, s0
-; GFX8-NEXT:    s_lshl_b32 s2, s2, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -775,7 +775,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, s1, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -794,18 +794,17 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NEXT:    s_and_b32 s1, s3, 3
-; GFX7-NEXT:    s_and_b32 s2, s2, s0
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
-; GFX7-NEXT:    s_lshl_b32 s1, s0, s1
-; GFX7-NEXT:    s_not_b32 s1, s1
+; GFX7-NEXT:    s_and_b32 s0, s3, 3
+; GFX7-NEXT:    s_and_b32 s1, s2, 0xff
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX7-NEXT:    s_lshl_b32 s1, s1, s0
+; GFX7-NEXT:    s_lshl_b32 s0, 0xff, s0
+; GFX7-NEXT:    s_not_b32 s0, s0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v2, s0, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -813,11 +812,11 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, s1, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, s1, v0
 ; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v2, s0, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -832,28 +831,27 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    s_movk_i32 s1, 0xff
-; GFX10-NEXT:    s_and_b32 s2, s2, s1
+; GFX10-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT:    s_mov_b32 s0, 16
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v1
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
 ; GFX10-NEXT:    s_and_b32 s0, s3, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v3, v1
-; GFX10-NEXT:    s_lshl_b32 s3, s1, s0
-; GFX10-NEXT:    s_lshl_b32 s0, s2, s0
-; GFX10-NEXT:    s_not_b32 s2, s3
+; GFX10-NEXT:    s_lshl_b32 s2, 0xff, s0
+; GFX10-NEXT:    s_lshl_b32 s0, s1, s0
+; GFX10-NEXT:    s_not_b32 s1, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s2, s0
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, s0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v4, v0, s1, v1
+; GFX10-NEXT:    v_and_or_b32 v4, 0xff, v0, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -877,7 +875,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s3, s0, 24
-; GFX9-NEXT:    s_and_b32 s6, s0, s5
+; GFX9-NEXT:    s_and_b32 s6, s0, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
@@ -887,7 +885,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_or_b32 s0, s0, s3
 ; GFX9-NEXT:    s_and_b32 s3, s4, 3
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 3
-; GFX9-NEXT:    s_lshl_b32 s4, s5, s3
+; GFX9-NEXT:    s_lshl_b32 s4, 0xff, s3
 ; GFX9-NEXT:    s_andn2_b32 s0, s0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, s3, v1
@@ -905,24 +903,23 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-LABEL: insertelement_s_v4i8_v_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX8-NEXT:    s_movk_i32 s1, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x80008
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX8-NEXT:    s_and_b32 s3, s0, s1
-; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x80008
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 24
+; GFX8-NEXT:    s_and_b32 s2, s0, 0xff
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
-; GFX8-NEXT:    s_or_b32 s3, s3, s5
+; GFX8-NEXT:    s_or_b32 s2, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX8-NEXT:    s_or_b32 s0, s3, s0
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX8-NEXT:    s_or_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s2, s4, 3
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 3
-; GFX8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
+; GFX8-NEXT:    s_or_b32 s0, s2, s0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s4, 3
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_lshl_b32 s1, 0xff, s1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
@@ -946,7 +943,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bfe_u32 s5, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
-; GFX7-NEXT:    s_and_b32 s3, s0, s2
+; GFX7-NEXT:    s_and_b32 s3, s0, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX7-NEXT:    s_or_b32 s3, s3, s5
@@ -957,7 +954,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s1, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
-; GFX7-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX7-NEXT:    s_lshl_b32 s1, 0xff, s1
 ; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX7-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
@@ -979,30 +976,29 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX10-LABEL: insertelement_s_v4i8_v_s:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT:    s_movk_i32 s2, 0xff
 ; GFX10-NEXT:    s_and_b32 s1, s4, 3
-; GFX10-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s5, s0, 0x80008
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 24
-; GFX10-NEXT:    s_and_b32 s4, s0, s2
+; GFX10-NEXT:    s_bfe_u32 s4, s0, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 24
+; GFX10-NEXT:    s_and_b32 s3, s0, 0xff
 ; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
-; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_or_b32 s4, s4, s5
-; GFX10-NEXT:    s_lshl_b32 s3, s3, 24
-; GFX10-NEXT:    s_or_b32 s0, s4, s0
-; GFX10-NEXT:    s_lshl_b32 s4, s2, s1
-; GFX10-NEXT:    s_or_b32 s0, s0, s3
-; GFX10-NEXT:    s_andn2_b32 s0, s0, s4
+; GFX10-NEXT:    s_or_b32 s3, s3, s4
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX10-NEXT:    s_or_b32 s0, s3, s0
+; GFX10-NEXT:    s_lshl_b32 s3, 0xff, s1
+; GFX10-NEXT:    s_or_b32 s0, s0, s2
+; GFX10-NEXT:    s_andn2_b32 s0, s0, s3
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, s1, s0
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT:    s_mov_b32 s0, 16
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v4, v0, s2, v1
+; GFX10-NEXT:    v_and_or_b32 v4, v0, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1019,14 +1015,14 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-LABEL: insertelement_s_v4i8_s_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    s_mov_b32 s1, 8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s3, s0, 24
-; GFX9-NEXT:    s_and_b32 s6, s0, s5
+; GFX9-NEXT:    s_and_b32 s6, s0, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
@@ -1034,7 +1030,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_or_b32 s0, s6, s0
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s3
-; GFX9-NEXT:    s_and_b32 s3, s4, s5
+; GFX9-NEXT:    s_and_b32 s3, s4, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v0, s3
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
 ; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
@@ -1054,14 +1050,14 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-LABEL: insertelement_s_v4i8_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX8-NEXT:    s_movk_i32 s1, 0xff
 ; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT:    s_movk_i32 s1, 0xff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX8-NEXT:    s_and_b32 s3, s0, s1
+; GFX8-NEXT:    s_and_b32 s3, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX8-NEXT:    s_or_b32 s3, s3, s5
@@ -1069,7 +1065,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_or_b32 s0, s3, s0
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s2, s4, s1
+; GFX8-NEXT:    s_and_b32 s2, s4, 0xff
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v0, s2
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
 ; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
@@ -1091,13 +1087,13 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-LABEL: insertelement_s_v4i8_s_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX7-NEXT:    s_movk_i32 s2, 0xff
 ; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT:    s_movk_i32 s2, 0xff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bfe_u32 s5, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
-; GFX7-NEXT:    s_and_b32 s3, s0, s2
+; GFX7-NEXT:    s_and_b32 s3, s0, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX7-NEXT:    s_or_b32 s3, s3, s5
@@ -1105,7 +1101,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_or_b32 s0, s3, s0
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s1, s4, s2
+; GFX7-NEXT:    s_and_b32 s1, s4, 0xff
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, s1, v0
 ; GFX7-NEXT:    v_lshl_b32_e32 v0, s2, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
@@ -1131,30 +1127,29 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX10-NEXT:    s_movk_i32 s1, 0xff
-; GFX10-NEXT:    s_and_b32 s2, s4, s1
+; GFX10-NEXT:    s_and_b32 s1, s4, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, s1
-; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
+; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xff
+; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
 ; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s4, s0, 0x80008
-; GFX10-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX10-NEXT:    s_and_b32 s3, s0, s1
+; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s1, s0, 24
+; GFX10-NEXT:    s_and_b32 s2, s0, 0xff
 ; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
-; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_or_b32 s3, s3, s4
-; GFX10-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX10-NEXT:    s_or_b32 s0, s3, s0
-; GFX10-NEXT:    s_or_b32 s0, s0, s2
+; GFX10-NEXT:    s_or_b32 s2, s2, s3
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX10-NEXT:    s_or_b32 s0, s2, s0
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    v_and_or_b32 v0, s0, v1, v0
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT:    s_mov_b32 s0, 16
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v4, v0, s1, v1
+; GFX10-NEXT:    v_and_or_b32 v4, v0, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1171,14 +1166,14 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-LABEL: insertelement_s_v4i8_v_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s3, s0, 24
-; GFX9-NEXT:    s_and_b32 s5, s0, s4
+; GFX9-NEXT:    s_and_b32 s5, s0, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX9-NEXT:    s_or_b32 s5, s5, s6
@@ -1205,14 +1200,14 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-LABEL: insertelement_s_v4i8_v_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX8-NEXT:    s_movk_i32 s1, 0xff
 ; GFX8-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX8-NEXT:    s_movk_i32 s1, 0xff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX8-NEXT:    s_and_b32 s3, s0, s1
+; GFX8-NEXT:    s_and_b32 s3, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX8-NEXT:    s_or_b32 s3, s3, s4
@@ -1248,7 +1243,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bfe_u32 s4, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
-; GFX7-NEXT:    s_and_b32 s3, s0, s2
+; GFX7-NEXT:    s_and_b32 s3, s0, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX7-NEXT:    s_or_b32 s3, s3, s4
@@ -1281,29 +1276,28 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 3, v1
-; GFX10-NEXT:    s_movk_i32 s1, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s4, s0, 0x80008
-; GFX10-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX10-NEXT:    s_and_b32 s3, s0, s1
+; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s1, s0, 24
+; GFX10-NEXT:    s_and_b32 s2, s0, 0xff
 ; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
-; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_or_b32 s3, s3, s4
-; GFX10-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX10-NEXT:    s_or_b32 s0, s3, s0
-; GFX10-NEXT:    s_or_b32 s0, s0, s2
+; GFX10-NEXT:    s_or_b32 s2, s2, s3
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX10-NEXT:    s_or_b32 s0, s2, s0
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    v_and_or_b32 v0, s0, v1, v0
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT:    s_mov_b32 s0, 16
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v4, v0, s1, v1
+; GFX10-NEXT:    v_and_or_b32 v4, v0, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1321,62 +1315,63 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
-; GFX9-NEXT:    s_movk_i32 s3, 0xff
 ; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    s_and_b32 s2, s2, s3
+; GFX9-NEXT:    s_movk_i32 s3, 0xff
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v4, v2, s2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v5, v2, s2
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s3
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX9-NEXT:    v_or3_b32 v0, v0, v7, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX9-NEXT:    v_or3_b32 v0, v0, v8, v6
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v4, v0, s3, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v3, v0, v1, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_or3_b32 v2, v4, v3, v2
+; GFX9-NEXT:    v_or3_b32 v2, v3, v4, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v4i8_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v1, 8
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    v_mov_b32_e32 v3, 16
+; GFX8-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX8-NEXT:    s_and_b32 s1, s2, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v4, 16
+; GFX8-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX8-NEXT:    v_lshlrev_b32_e64 v6, v2, s1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
-; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-NEXT:    v_mov_b32_e32 v5, 16
+; GFX8-NEXT:    v_lshlrev_b32_e64 v7, v2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, 8
+; GFX8-NEXT:    v_mov_b32_e32 v6, 16
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v7
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
 ; GFX8-NEXT:    v_or_b32_e32 v3, v0, v3
@@ -1392,37 +1387,37 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NEXT:    v_and_b32_e32 v1, 3, v2
-; GFX7-NEXT:    s_and_b32 s1, s2, s0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v1
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
-; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX7-NEXT:    s_and_b32 s0, s2, 0xff
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v3, s0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v2, v1
+; GFX7-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v4, s0, v0
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v2, s0, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, v0, v1
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -1431,17 +1426,16 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    v_and_b32_e32 v1, 3, v2
 ; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    s_movk_i32 s1, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v1, s1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v1, 0xff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    s_mov_b32 s0, 16
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v2
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
-; GFX10-NEXT:    s_and_b32 s0, s2, s1
+; GFX10-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v5, v2
@@ -1451,7 +1445,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s1, v2
+; GFX10-NEXT:    v_and_or_b32 v2, 0xff, v0, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1474,40 +1468,40 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    s_movk_i32 s3, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX9-NEXT:    s_lshl_b32 s2, 0xff, s2
 ; GFX9-NEXT:    s_not_b32 s2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX9-NEXT:    v_or3_b32 v0, v0, v6, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX9-NEXT:    v_or3_b32 v0, v0, v7, v5
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s2, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v4, v0, s3, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v3, v0, v1, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_or3_b32 v2, v4, v3, v2
+; GFX9-NEXT:    v_or3_b32 v2, v3, v4, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v4i8_v_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_and_b32 s1, s2, 3
+; GFX8-NEXT:    s_and_b32 s0, s2, 3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 8
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 3
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 16
-; GFX8-NEXT:    v_mov_b32_e32 v6, s1
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    s_lshl_b32 s0, 0xff, s0
 ; GFX8-NEXT:    s_not_b32 s0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 16
@@ -1544,7 +1538,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_and_b32_e32 v1, s0, v2
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s1, v1
-; GFX7-NEXT:    s_lshl_b32 s1, s0, s1
+; GFX7-NEXT:    s_lshl_b32 s1, 0xff, s1
 ; GFX7-NEXT:    s_not_b32 s1, s1
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -1562,7 +1556,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v2, s0, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -1577,27 +1571,26 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    s_mov_b32 s1, 16
-; GFX10-NEXT:    s_and_b32 s2, s2, 3
+; GFX10-NEXT:    s_and_b32 s1, s2, 3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX10-NEXT:    s_movk_i32 s0, 0xff
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    s_lshl_b32 s1, s2, 3
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s0, v1
+; GFX10-NEXT:    s_mov_b32 s0, 16
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    s_lshl_b32 s1, s0, s1
-; GFX10-NEXT:    s_not_b32 s1, s1
+; GFX10-NEXT:    s_lshl_b32 s0, s1, 3
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    s_lshl_b32 s0, 0xff, s0
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v4, v1
+; GFX10-NEXT:    s_not_b32 s0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v2
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s0, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v4, v0, s0, v1
+; GFX10-NEXT:    v_and_or_b32 v4, 0xff, v0, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1725,9 +1718,8 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    v_and_b32_e32 v1, 3, v3
 ; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    s_movk_i32 s1, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v1, s1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v1, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -1735,7 +1727,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    s_mov_b32 s0, 16
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v3
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v6, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 8
@@ -1761,28 +1753,25 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-LABEL: insertelement_s_v8i8_s_s:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX9-NEXT:    s_mov_b32 s8, 0x80008
-; GFX9-NEXT:    s_movk_i32 s6, 0xff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_u32 s9, s0, s8
-; GFX9-NEXT:    s_and_b32 s7, s0, s6
-; GFX9-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_mov_b32 s9, 0x80010
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX9-NEXT:    s_bfe_u32 s0, s0, s9
+; GFX9-NEXT:    s_and_b32 s6, s0, 0xff
+; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX9-NEXT:    s_or_b32 s0, s7, s0
+; GFX9-NEXT:    s_or_b32 s0, s6, s0
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX9-NEXT:    s_bfe_u32 s7, s1, s8
+; GFX9-NEXT:    s_bfe_u32 s6, s1, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s2
-; GFX9-NEXT:    s_and_b32 s2, s1, s6
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s9
-; GFX9-NEXT:    s_or_b32 s2, s2, s7
+; GFX9-NEXT:    s_and_b32 s2, s1, 0xff
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX9-NEXT:    s_or_b32 s2, s2, s6
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s2, s1
 ; GFX9-NEXT:    s_lshl_b32 s2, s3, 24
@@ -1792,30 +1781,30 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_cselect_b32 s3, s1, s0
 ; GFX9-NEXT:    s_and_b32 s5, s5, 3
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 3
-; GFX9-NEXT:    s_and_b32 s4, s4, s6
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, s5
-; GFX9-NEXT:    s_lshl_b32 s5, s6, s5
+; GFX9-NEXT:    s_lshl_b32 s5, 0xff, s5
 ; GFX9-NEXT:    s_andn2_b32 s3, s3, s5
 ; GFX9-NEXT:    s_or_b32 s3, s3, s4
 ; GFX9-NEXT:    s_cmp_eq_u32 s2, 0
 ; GFX9-NEXT:    s_cselect_b32 s0, s3, s0
 ; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX9-NEXT:    s_bfe_u32 s5, s0, s8
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX9-NEXT:    s_and_b32 s4, s0, s6
+; GFX9-NEXT:    s_and_b32 s4, s0, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX9-NEXT:    s_bfe_u32 s0, s0, s9
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX9-NEXT:    s_or_b32 s4, s4, s5
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s4, s0
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX9-NEXT:    s_bfe_u32 s4, s1, s8
+; GFX9-NEXT:    s_bfe_u32 s4, s1, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s2
-; GFX9-NEXT:    s_and_b32 s2, s1, s6
+; GFX9-NEXT:    s_and_b32 s2, s1, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s9
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX9-NEXT:    s_or_b32 s2, s2, s4
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s2, s1
@@ -1829,28 +1818,25 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-LABEL: insertelement_s_v8i8_s_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX8-NEXT:    s_mov_b32 s8, 0x80008
-; GFX8-NEXT:    s_movk_i32 s6, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s9, s0, s8
-; GFX8-NEXT:    s_and_b32 s7, s0, s6
-; GFX8-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX8-NEXT:    s_or_b32 s7, s7, s9
-; GFX8-NEXT:    s_mov_b32 s9, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX8-NEXT:    s_bfe_u32 s0, s0, s9
+; GFX8-NEXT:    s_and_b32 s6, s0, 0xff
+; GFX8-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX8-NEXT:    s_or_b32 s6, s6, s7
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX8-NEXT:    s_or_b32 s0, s7, s0
+; GFX8-NEXT:    s_or_b32 s0, s6, s0
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX8-NEXT:    s_bfe_u32 s7, s1, s8
+; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s2, s1, s6
-; GFX8-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s9
-; GFX8-NEXT:    s_or_b32 s2, s2, s7
+; GFX8-NEXT:    s_and_b32 s2, s1, 0xff
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX8-NEXT:    s_or_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, 24
@@ -1860,30 +1846,30 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_cselect_b32 s3, s1, s0
 ; GFX8-NEXT:    s_and_b32 s5, s5, 3
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 3
-; GFX8-NEXT:    s_and_b32 s4, s4, s6
+; GFX8-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, s5
-; GFX8-NEXT:    s_lshl_b32 s5, s6, s5
+; GFX8-NEXT:    s_lshl_b32 s5, 0xff, s5
 ; GFX8-NEXT:    s_andn2_b32 s3, s3, s5
 ; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    s_cmp_eq_u32 s2, 0
 ; GFX8-NEXT:    s_cselect_b32 s0, s3, s0
 ; GFX8-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX8-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX8-NEXT:    s_bfe_u32 s5, s0, s8
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX8-NEXT:    s_and_b32 s4, s0, s6
+; GFX8-NEXT:    s_and_b32 s4, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX8-NEXT:    s_bfe_u32 s0, s0, s9
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX8-NEXT:    s_or_b32 s4, s4, s5
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s4, s0
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX8-NEXT:    s_bfe_u32 s4, s1, s8
+; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s2, s1, s6
+; GFX8-NEXT:    s_and_b32 s2, s1, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 8
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s9
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
@@ -1897,26 +1883,23 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-LABEL: insertelement_s_v8i8_s_s:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX7-NEXT:    s_mov_b32 s8, 0x80008
-; GFX7-NEXT:    s_movk_i32 s6, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s9, s0, s8
-; GFX7-NEXT:    s_and_b32 s7, s0, s6
-; GFX7-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX7-NEXT:    s_or_b32 s7, s7, s9
-; GFX7-NEXT:    s_mov_b32 s9, 0x80010
+; GFX7-NEXT:    s_bfe_u32 s7, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX7-NEXT:    s_bfe_u32 s0, s0, s9
+; GFX7-NEXT:    s_and_b32 s6, s0, 0xff
+; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX7-NEXT:    s_or_b32 s6, s6, s7
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX7-NEXT:    s_or_b32 s0, s7, s0
+; GFX7-NEXT:    s_or_b32 s0, s6, s0
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX7-NEXT:    s_bfe_u32 s7, s1, s8
+; GFX7-NEXT:    s_bfe_u32 s6, s1, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
-; GFX7-NEXT:    s_and_b32 s2, s1, s6
-; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s9
-; GFX7-NEXT:    s_or_b32 s2, s2, s7
+; GFX7-NEXT:    s_and_b32 s2, s1, 0xff
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX7-NEXT:    s_or_b32 s2, s2, s6
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s1, s2, s1
 ; GFX7-NEXT:    s_lshl_b32 s2, s3, 24
@@ -1926,30 +1909,30 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_cselect_b32 s3, s1, s0
 ; GFX7-NEXT:    s_and_b32 s5, s5, 3
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 3
-; GFX7-NEXT:    s_and_b32 s4, s4, s6
+; GFX7-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, s5
-; GFX7-NEXT:    s_lshl_b32 s5, s6, s5
+; GFX7-NEXT:    s_lshl_b32 s5, 0xff, s5
 ; GFX7-NEXT:    s_andn2_b32 s3, s3, s5
 ; GFX7-NEXT:    s_or_b32 s3, s3, s4
 ; GFX7-NEXT:    s_cmp_eq_u32 s2, 0
 ; GFX7-NEXT:    s_cselect_b32 s4, s3, s0
 ; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX7-NEXT:    s_cselect_b32 s3, s3, s1
-; GFX7-NEXT:    s_bfe_u32 s10, s4, s8
+; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s2, s4, 24
-; GFX7-NEXT:    s_and_b32 s7, s4, s6
-; GFX7-NEXT:    s_lshl_b32 s10, s10, 8
-; GFX7-NEXT:    s_bfe_u32 s4, s4, s9
-; GFX7-NEXT:    s_or_b32 s7, s7, s10
+; GFX7-NEXT:    s_and_b32 s6, s4, 0xff
+; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX7-NEXT:    s_bfe_u32 s4, s4, 0x80010
+; GFX7-NEXT:    s_or_b32 s6, s6, s7
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX7-NEXT:    s_or_b32 s4, s7, s4
+; GFX7-NEXT:    s_or_b32 s4, s6, s4
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX7-NEXT:    s_or_b32 s2, s4, s2
-; GFX7-NEXT:    s_and_b32 s4, s3, s6
-; GFX7-NEXT:    s_bfe_u32 s6, s3, s8
+; GFX7-NEXT:    s_bfe_u32 s6, s3, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s5, s3, 24
+; GFX7-NEXT:    s_or_b32 s2, s4, s2
+; GFX7-NEXT:    s_and_b32 s4, s3, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX7-NEXT:    s_bfe_u32 s3, s3, s9
+; GFX7-NEXT:    s_bfe_u32 s3, s3, 0x80010
 ; GFX7-NEXT:    s_or_b32 s4, s4, s6
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX7-NEXT:    s_or_b32 s3, s4, s3
@@ -1966,68 +1949,65 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-LABEL: insertelement_s_v8i8_s_s:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s3, 0x80008
-; GFX10-NEXT:    s_movk_i32 s2, 0xff
-; GFX10-NEXT:    s_mov_b32 s6, 0x80010
-; GFX10-NEXT:    s_lshr_b32 s7, s5, 2
+; GFX10-NEXT:    s_lshr_b32 s2, s5, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s11, s0, s3
-; GFX10-NEXT:    s_bfe_u32 s13, s1, s3
-; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
-; GFX10-NEXT:    s_lshr_b32 s9, s1, 24
-; GFX10-NEXT:    s_and_b32 s10, s0, s2
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s6
-; GFX10-NEXT:    s_and_b32 s12, s1, s2
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s6
-; GFX10-NEXT:    s_lshl_b32 s11, s11, 8
-; GFX10-NEXT:    s_lshl_b32 s13, s13, 8
+; GFX10-NEXT:    s_bfe_u32 s8, s0, 0x80008
+; GFX10-NEXT:    s_bfe_u32 s10, s1, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX10-NEXT:    s_lshr_b32 s6, s1, 24
+; GFX10-NEXT:    s_and_b32 s7, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX10-NEXT:    s_and_b32 s9, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX10-NEXT:    s_lshl_b32 s10, s10, 8
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX10-NEXT:    s_or_b32 s10, s10, s11
-; GFX10-NEXT:    s_or_b32 s11, s12, s13
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 24
-; GFX10-NEXT:    s_lshl_b32 s9, s9, 24
-; GFX10-NEXT:    s_or_b32 s0, s10, s0
-; GFX10-NEXT:    s_or_b32 s1, s11, s1
-; GFX10-NEXT:    s_or_b32 s0, s0, s8
-; GFX10-NEXT:    s_or_b32 s1, s1, s9
-; GFX10-NEXT:    s_cmp_eq_u32 s7, 1
-; GFX10-NEXT:    s_cselect_b32 s8, s1, s0
+; GFX10-NEXT:    s_or_b32 s7, s7, s8
+; GFX10-NEXT:    s_or_b32 s8, s9, s10
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 24
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX10-NEXT:    s_or_b32 s0, s7, s0
+; GFX10-NEXT:    s_or_b32 s1, s8, s1
+; GFX10-NEXT:    s_or_b32 s0, s0, s3
+; GFX10-NEXT:    s_or_b32 s1, s1, s6
+; GFX10-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX10-NEXT:    s_cselect_b32 s3, s1, s0
 ; GFX10-NEXT:    s_and_b32 s5, s5, 3
-; GFX10-NEXT:    s_and_b32 s4, s4, s2
+; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 3
-; GFX10-NEXT:    s_lshl_b32 s9, s2, s5
+; GFX10-NEXT:    s_lshl_b32 s6, 0xff, s5
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, s5
-; GFX10-NEXT:    s_andn2_b32 s5, s8, s9
-; GFX10-NEXT:    s_or_b32 s4, s5, s4
-; GFX10-NEXT:    s_cmp_eq_u32 s7, 0
-; GFX10-NEXT:    s_cselect_b32 s0, s4, s0
-; GFX10-NEXT:    s_cmp_eq_u32 s7, 1
-; GFX10-NEXT:    s_cselect_b32 s1, s4, s1
-; GFX10-NEXT:    s_bfe_u32 s7, s0, s3
-; GFX10-NEXT:    s_bfe_u32 s3, s1, s3
-; GFX10-NEXT:    s_and_b32 s5, s0, s2
-; GFX10-NEXT:    s_lshr_b32 s8, s1, 24
-; GFX10-NEXT:    s_and_b32 s2, s1, s2
-; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s6
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s6
-; GFX10-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX10-NEXT:    s_or_b32 s2, s2, s3
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX10-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX10-NEXT:    s_or_b32 s3, s3, s4
+; GFX10-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX10-NEXT:    s_cselect_b32 s0, s3, s0
+; GFX10-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX10-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX10-NEXT:    s_bfe_u32 s4, s0, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 24
+; GFX10-NEXT:    s_and_b32 s3, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX10-NEXT:    s_bfe_u32 s7, s1, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s5, s1, 24
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_or_b32 s5, s5, s7
-; GFX10-NEXT:    s_or_b32 s1, s2, s1
-; GFX10-NEXT:    s_lshl_b32 s2, s8, 24
-; GFX10-NEXT:    s_or_b32 s0, s5, s0
-; GFX10-NEXT:    s_lshl_b32 s3, s4, 24
-; GFX10-NEXT:    s_or_b32 s1, s1, s2
-; GFX10-NEXT:    s_or_b32 s0, s0, s3
-; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    s_and_b32 s6, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT:    s_or_b32 s3, s3, s4
+; GFX10-NEXT:    s_lshl_b32 s4, s7, 8
+; GFX10-NEXT:    s_or_b32 s0, s3, s0
+; GFX10-NEXT:    s_or_b32 s3, s6, s4
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX10-NEXT:    s_or_b32 s1, s3, s1
+; GFX10-NEXT:    s_lshl_b32 s3, s5, 24
+; GFX10-NEXT:    s_or_b32 s0, s0, s2
+; GFX10-NEXT:    s_or_b32 s1, s1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
@@ -2045,47 +2025,48 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s5, s3, 2
 ; GFX9-NEXT:    s_and_b32 s3, s3, 3
-; GFX9-NEXT:    s_and_b32 s2, s2, s4
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 3
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX9-NEXT:    s_lshl_b32 s3, s4, s3
+; GFX9-NEXT:    s_lshl_b32 s3, 0xff, s3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
 ; GFX9-NEXT:    s_not_b32 s3, s3
-; GFX9-NEXT:    v_mov_b32_e32 v6, s2
-; GFX9-NEXT:    v_mov_b32_e32 v4, 8
-; GFX9-NEXT:    v_mov_b32_e32 v5, 16
+; GFX9-NEXT:    v_mov_b32_e32 v7, s2
+; GFX9-NEXT:    v_mov_b32_e32 v5, 8
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v6, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX9-NEXT:    v_or3_b32 v0, v0, v11, v8
+; GFX9-NEXT:    v_or3_b32 v1, v1, v13, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v7, v8, s3, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s5, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v9
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v10, v7
-; GFX9-NEXT:    v_or3_b32 v1, v1, v12, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v6, v7, s3, v6
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s5, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
-; GFX9-NEXT:    v_or3_b32 v0, v0, v9, v6
-; GFX9-NEXT:    v_or3_b32 v1, v1, v5, v4
+; GFX9-NEXT:    v_or3_b32 v1, v1, v6, v4
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2094,15 +2075,14 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 16
-; GFX8-NEXT:    s_lshr_b32 s1, s3, 2
-; GFX8-NEXT:    s_and_b32 s3, s3, 3
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 3
-; GFX8-NEXT:    s_and_b32 s2, s2, s0
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s3
-; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
-; GFX8-NEXT:    s_not_b32 s0, s0
-; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, s3, 3
+; GFX8-NEXT:    s_lshr_b32 s0, s3, 2
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX8-NEXT:    s_lshl_b32 s1, 0xff, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX8-NEXT:    s_not_b32 s1, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v6, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
@@ -2123,9 +2103,9 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v4, s0, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, s1, v4
 ; GFX8-NEXT:    v_or_b32_e32 v4, s2, v4
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -2151,63 +2131,64 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s6, 0xff
-; GFX7-NEXT:    s_and_b32 s1, s3, 3
-; GFX7-NEXT:    s_lshr_b32 s0, s3, 2
-; GFX7-NEXT:    s_and_b32 s2, s2, s6
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
-; GFX7-NEXT:    s_lshl_b32 s1, s6, s1
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT:    s_not_b32 s1, s1
+; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    s_lshr_b32 s1, s3, 2
+; GFX7-NEXT:    s_and_b32 s3, s3, 3
+; GFX7-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX7-NEXT:    s_lshl_b32 s3, s3, 3
+; GFX7-NEXT:    s_lshl_b32 s2, s2, s3
+; GFX7-NEXT:    s_lshl_b32 s3, 0xff, s3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX7-NEXT:    s_not_b32 s3, s3
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v4, s6, v0
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v5, s0, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v6, s6, v1
+; GFX7-NEXT:    v_and_b32_e32 v7, s0, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT:    v_or_b32_e32 v6, v7, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v2, s1, v2
-; GFX7-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v3, s3, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, s2, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v4, s6, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v5, v0, v2
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v6, s6, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, v1, v2
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -2216,8 +2197,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    s_mov_b32 s1, 16
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    s_and_b32 s2, s2, s4
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -2225,9 +2205,9 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v3
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s4, v5
+; GFX10-NEXT:    v_and_or_b32 v1, v1, 0xff, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX10-NEXT:    s_lshr_b32 s0, s3, 2
 ; GFX10-NEXT:    s_and_b32 s1, s3, 3
@@ -2236,7 +2216,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_or3_b32 v1, v1, v7, v3
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 3
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s0, 0
-; GFX10-NEXT:    s_lshl_b32 s3, s4, s1
+; GFX10-NEXT:    s_lshl_b32 s3, 0xff, s1
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, s1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc_lo
 ; GFX10-NEXT:    s_not_b32 s2, s3
@@ -2251,9 +2231,9 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, v5
+; GFX10-NEXT:    v_and_or_b32 v2, 0xff, v0, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_and_or_b32 v3, v1, s4, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v1, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -2271,28 +2251,27 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-LABEL: insertelement_s_v8i8_v_s:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX9-NEXT:    s_mov_b32 s9, 0x80008
-; GFX9-NEXT:    s_movk_i32 s7, 0xff
-; GFX9-NEXT:    v_and_b32_e32 v0, s7, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    s_mov_b32 s2, 8
+; GFX9-NEXT:    s_mov_b32 s3, 16
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_u32 s10, s0, s9
-; GFX9-NEXT:    s_and_b32 s8, s0, s7
-; GFX9-NEXT:    s_lshl_b32 s10, s10, 8
-; GFX9-NEXT:    s_or_b32 s8, s8, s10
-; GFX9-NEXT:    s_mov_b32 s10, 0x80010
+; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX9-NEXT:    s_bfe_u32 s0, s0, s10
+; GFX9-NEXT:    s_and_b32 s7, s0, 0xff
+; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT:    s_or_b32 s7, s7, s8
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX9-NEXT:    s_or_b32 s0, s8, s0
+; GFX9-NEXT:    s_or_b32 s0, s7, s0
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 24
-; GFX9-NEXT:    s_bfe_u32 s8, s1, s9
+; GFX9-NEXT:    s_bfe_u32 s7, s1, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s6, s1, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s5
-; GFX9-NEXT:    s_and_b32 s5, s1, s7
-; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s10
-; GFX9-NEXT:    s_or_b32 s5, s5, s8
+; GFX9-NEXT:    s_and_b32 s5, s1, 0xff
+; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX9-NEXT:    s_or_b32 s5, s5, s7
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s5, s1
 ; GFX9-NEXT:    s_lshl_b32 s5, s6, 24
@@ -2302,59 +2281,55 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_cselect_b32 s6, s1, s0
 ; GFX9-NEXT:    s_and_b32 s4, s4, 3
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 3
-; GFX9-NEXT:    s_lshl_b32 s8, s7, s4
-; GFX9-NEXT:    s_andn2_b32 s6, s6, s8
+; GFX9-NEXT:    s_lshl_b32 s7, 0xff, s4
+; GFX9-NEXT:    s_andn2_b32 s6, s6, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_lshl_or_b32 v2, v0, s4, v1
+; GFX9-NEXT:    v_lshl_or_b32 v3, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX9-NEXT:    s_mov_b32 s3, 16
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v4, v0, s7, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v5, v0, v2, v5
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX9-NEXT:    v_or3_b32 v0, v4, v0, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v4, v1, s7, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX9-NEXT:    v_or3_b32 v0, v5, v0, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v5, v1, v2, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v5
+; GFX9-NEXT:    v_or3_b32 v1, v5, v1, v4
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_s_v8i8_v_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX8-NEXT:    s_mov_b32 s7, 0x80008
-; GFX8-NEXT:    s_movk_i32 s5, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v6, 16
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s8, s0, s7
-; GFX8-NEXT:    s_and_b32 s6, s0, s5
-; GFX8-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX8-NEXT:    s_or_b32 s6, s6, s8
-; GFX8-NEXT:    s_mov_b32 s8, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX8-NEXT:    s_bfe_u32 s0, s0, s8
+; GFX8-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX8-NEXT:    s_or_b32 s5, s5, s6
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX8-NEXT:    s_or_b32 s0, s6, s0
+; GFX8-NEXT:    s_or_b32 s0, s5, s0
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX8-NEXT:    s_bfe_u32 s6, s1, s7
+; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s2, s1, s5
-; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s8
-; GFX8-NEXT:    s_or_b32 s2, s2, s6
+; GFX8-NEXT:    s_and_b32 s2, s1, 0xff
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX8-NEXT:    s_or_b32 s2, s2, s5
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, 24
@@ -2365,7 +2340,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_and_b32 s4, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX8-NEXT:    s_lshl_b32 s4, 0xff, s4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    s_andn2_b32 s3, s3, s4
 ; GFX8-NEXT:    v_or_b32_e32 v2, s3, v0
@@ -2397,27 +2372,25 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-LABEL: insertelement_s_v8i8_v_s:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX7-NEXT:    s_mov_b32 s7, 0x80008
-; GFX7-NEXT:    s_movk_i32 s5, 0xff
-; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s8, s0, s7
-; GFX7-NEXT:    s_and_b32 s6, s0, s5
-; GFX7-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX7-NEXT:    s_or_b32 s6, s6, s8
-; GFX7-NEXT:    s_mov_b32 s8, 0x80010
+; GFX7-NEXT:    s_bfe_u32 s6, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX7-NEXT:    s_bfe_u32 s0, s0, s8
+; GFX7-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX7-NEXT:    s_or_b32 s5, s5, s6
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX7-NEXT:    s_or_b32 s0, s6, s0
+; GFX7-NEXT:    s_or_b32 s0, s5, s0
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX7-NEXT:    s_bfe_u32 s6, s1, s7
+; GFX7-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
-; GFX7-NEXT:    s_and_b32 s2, s1, s5
-; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s8
-; GFX7-NEXT:    s_or_b32 s2, s2, s6
+; GFX7-NEXT:    s_and_b32 s2, s1, 0xff
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX7-NEXT:    s_or_b32 s2, s2, s5
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s1, s2, s1
 ; GFX7-NEXT:    s_lshl_b32 s2, s3, 24
@@ -2428,34 +2401,34 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s4, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
-; GFX7-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX7-NEXT:    s_lshl_b32 s4, 0xff, s4
 ; GFX7-NEXT:    s_andn2_b32 s3, s3, s4
-; GFX7-NEXT:    v_or_b32_e32 v2, s3, v0
+; GFX7-NEXT:    v_or_b32_e32 v3, s3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v4, s5, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_bfe_u32 v4, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s5, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_bfe_u32 v3, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, v1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -2466,57 +2439,54 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-LABEL: insertelement_s_v8i8_v_s:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s3, 0x80008
-; GFX10-NEXT:    s_movk_i32 s2, 0xff
-; GFX10-NEXT:    s_mov_b32 s5, 0x80010
-; GFX10-NEXT:    s_lshr_b32 s6, s4, 2
-; GFX10-NEXT:    v_and_b32_e32 v2, s2, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s6, 0
+; GFX10-NEXT:    s_lshr_b32 s2, s4, 2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s10, s0, s3
-; GFX10-NEXT:    s_bfe_u32 s3, s1, s3
-; GFX10-NEXT:    s_lshr_b32 s7, s0, 24
-; GFX10-NEXT:    s_lshr_b32 s8, s1, 24
-; GFX10-NEXT:    s_and_b32 s9, s0, s2
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s5
-; GFX10-NEXT:    s_and_b32 s11, s1, s2
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s5
-; GFX10-NEXT:    s_lshl_b32 s5, s10, 8
-; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_bfe_u32 s7, s0, 0x80008
+; GFX10-NEXT:    s_bfe_u32 s9, s1, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX10-NEXT:    s_lshr_b32 s5, s1, 24
+; GFX10-NEXT:    s_and_b32 s6, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX10-NEXT:    s_and_b32 s8, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX10-NEXT:    s_lshl_b32 s9, s9, 8
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX10-NEXT:    s_or_b32 s5, s9, s5
-; GFX10-NEXT:    s_or_b32 s3, s11, s3
-; GFX10-NEXT:    s_lshl_b32 s7, s7, 24
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 24
-; GFX10-NEXT:    s_or_b32 s0, s5, s0
-; GFX10-NEXT:    s_or_b32 s1, s3, s1
-; GFX10-NEXT:    s_or_b32 s0, s0, s7
-; GFX10-NEXT:    s_or_b32 s1, s1, s8
-; GFX10-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX10-NEXT:    s_or_b32 s6, s6, s7
+; GFX10-NEXT:    s_or_b32 s7, s8, s9
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 24
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX10-NEXT:    s_or_b32 s0, s6, s0
+; GFX10-NEXT:    s_or_b32 s1, s7, s1
+; GFX10-NEXT:    s_or_b32 s0, s0, s3
+; GFX10-NEXT:    s_or_b32 s1, s1, s5
+; GFX10-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    s_cselect_b32 s3, s1, s0
 ; GFX10-NEXT:    s_and_b32 s4, s4, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    s_lshl_b32 s5, s2, s4
+; GFX10-NEXT:    s_lshl_b32 s5, 0xff, s4
 ; GFX10-NEXT:    s_mov_b32 s1, 16
 ; GFX10-NEXT:    s_andn2_b32 s3, s3, s5
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v2, s4, s3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s6, 1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v3, v0, s2, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v0, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_and_or_b32 v5, v1, s2, v5
+; GFX10-NEXT:    v_and_or_b32 v5, 0xff, v1, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_or3_b32 v2, v3, v6, v2
@@ -2533,27 +2503,26 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-LABEL: insertelement_s_v8i8_s_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX9-NEXT:    s_mov_b32 s9, 0x80008
-; GFX9-NEXT:    s_movk_i32 s7, 0xff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_movk_i32 s7, 0xff
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_u32 s10, s0, s9
-; GFX9-NEXT:    s_and_b32 s8, s0, s7
-; GFX9-NEXT:    s_lshl_b32 s10, s10, 8
-; GFX9-NEXT:    s_or_b32 s8, s8, s10
-; GFX9-NEXT:    s_mov_b32 s10, 0x80010
+; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX9-NEXT:    s_bfe_u32 s0, s0, s10
+; GFX9-NEXT:    s_and_b32 s8, s0, 0xff
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT:    s_or_b32 s8, s8, s9
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s8, s0
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 24
-; GFX9-NEXT:    s_bfe_u32 s8, s1, s9
+; GFX9-NEXT:    s_bfe_u32 s8, s1, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s6, s1, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s5
-; GFX9-NEXT:    s_and_b32 s5, s1, s7
+; GFX9-NEXT:    s_and_b32 s5, s1, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s10
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX9-NEXT:    s_or_b32 s5, s5, s8
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s5, s1
@@ -2561,9 +2530,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_or_b32 s1, s1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT:    s_and_b32 s4, s4, s7
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v0, s4
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s7
@@ -2575,16 +2543,17 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_mov_b32 s2, 8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
 ; GFX9-NEXT:    s_mov_b32 s3, 16
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_and_or_b32 v4, v0, s7, v4
+; GFX9-NEXT:    v_and_or_b32 v5, v0, v4, v5
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX9-NEXT:    v_or3_b32 v0, v4, v0, v2
+; GFX9-NEXT:    v_or3_b32 v0, v5, v0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v4, v1, s7, v2
+; GFX9-NEXT:    v_and_or_b32 v4, v1, v4, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -2596,27 +2565,26 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-LABEL: insertelement_s_v8i8_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX8-NEXT:    s_mov_b32 s7, 0x80008
-; GFX8-NEXT:    s_movk_i32 s5, 0xff
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT:    s_movk_i32 s5, 0xff
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s8, s0, s7
-; GFX8-NEXT:    s_and_b32 s6, s0, s5
-; GFX8-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX8-NEXT:    s_or_b32 s6, s6, s8
-; GFX8-NEXT:    s_mov_b32 s8, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX8-NEXT:    s_bfe_u32 s0, s0, s8
+; GFX8-NEXT:    s_and_b32 s6, s0, 0xff
+; GFX8-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX8-NEXT:    s_or_b32 s6, s6, s7
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s6, s0
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX8-NEXT:    s_bfe_u32 s6, s1, s7
+; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s2, s1, s5
+; GFX8-NEXT:    s_and_b32 s2, s1, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
@@ -2624,9 +2592,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT:    s_and_b32 s2, s4, s5
+; GFX8-NEXT:    s_and_b32 s2, s4, 0xff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v0, s2
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
@@ -2662,27 +2629,26 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-LABEL: insertelement_s_v8i8_s_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX7-NEXT:    s_mov_b32 s7, 0x80008
-; GFX7-NEXT:    s_movk_i32 s5, 0xff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT:    s_movk_i32 s5, 0xff
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s8, s0, s7
-; GFX7-NEXT:    s_and_b32 s6, s0, s5
-; GFX7-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX7-NEXT:    s_or_b32 s6, s6, s8
-; GFX7-NEXT:    s_mov_b32 s8, 0x80010
+; GFX7-NEXT:    s_bfe_u32 s7, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX7-NEXT:    s_bfe_u32 s0, s0, s8
+; GFX7-NEXT:    s_and_b32 s6, s0, 0xff
+; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX7-NEXT:    s_or_b32 s6, s6, s7
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s6, s0
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX7-NEXT:    s_bfe_u32 s6, s1, s7
+; GFX7-NEXT:    s_bfe_u32 s6, s1, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
-; GFX7-NEXT:    s_and_b32 s2, s1, s5
+; GFX7-NEXT:    s_and_b32 s2, s1, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX7-NEXT:    s_or_b32 s2, s2, s6
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s1, s2, s1
@@ -2690,9 +2656,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT:    s_and_b32 s2, s4, s5
+; GFX7-NEXT:    s_and_b32 s2, s4, 0xff
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX7-NEXT:    v_lshl_b32_e32 v3, s2, v0
 ; GFX7-NEXT:    v_lshl_b32_e32 v0, s5, v0
@@ -2716,7 +2681,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_bfe_u32 v4, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s5, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
@@ -2734,38 +2699,35 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 3, v0
-; GFX10-NEXT:    s_mov_b32 s3, 0x80008
-; GFX10-NEXT:    s_movk_i32 s2, 0xff
-; GFX10-NEXT:    s_mov_b32 s5, 0x80010
-; GFX10-NEXT:    s_and_b32 s4, s4, s2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX10-NEXT:    s_and_b32 s2, s4, 0xff
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
-; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s4
-; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v1, s2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s2
+; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v1, 0xff
 ; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s8, s0, s3
-; GFX10-NEXT:    s_bfe_u32 s3, s1, s3
-; GFX10-NEXT:    s_lshr_b32 s6, s1, 24
-; GFX10-NEXT:    s_and_b32 s9, s1, s2
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s5
-; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_bfe_u32 s7, s1, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX10-NEXT:    s_and_b32 s6, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 8
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX10-NEXT:    s_or_b32 s3, s9, s3
-; GFX10-NEXT:    s_lshl_b32 s6, s6, 24
-; GFX10-NEXT:    s_or_b32 s1, s3, s1
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
-; GFX10-NEXT:    s_and_b32 s7, s0, s2
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s5
-; GFX10-NEXT:    s_lshl_b32 s5, s8, 8
-; GFX10-NEXT:    s_or_b32 s1, s1, s6
+; GFX10-NEXT:    s_or_b32 s6, s6, s7
+; GFX10-NEXT:    s_bfe_u32 s5, s0, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 24
+; GFX10-NEXT:    s_or_b32 s1, s6, s1
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 24
+; GFX10-NEXT:    s_and_b32 s4, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX10-NEXT:    s_or_b32 s1, s1, s3
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_lshl_b32 s3, s4, 24
-; GFX10-NEXT:    s_or_b32 s4, s7, s5
+; GFX10-NEXT:    s_or_b32 s3, s4, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    s_or_b32 s0, s4, s0
-; GFX10-NEXT:    s_or_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX10-NEXT:    s_or_b32 s0, s3, s0
+; GFX10-NEXT:    s_or_b32 s0, s0, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, s0, v1, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
@@ -2781,9 +2743,9 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v3, v0, s2, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v0, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_and_or_b32 v5, v1, s2, v5
+; GFX10-NEXT:    v_and_or_b32 v5, 0xff, v1, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -2801,33 +2763,31 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-LABEL: insertelement_s_v8i8_v_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX9-NEXT:    s_mov_b32 s8, 0x80008
-; GFX9-NEXT:    s_movk_i32 s6, 0xff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 2, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX9-NEXT:    s_movk_i32 s6, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_u32 s9, s0, s8
-; GFX9-NEXT:    s_and_b32 s7, s0, s6
-; GFX9-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_mov_b32 s9, 0x80010
+; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s4, s0, 24
-; GFX9-NEXT:    s_bfe_u32 s0, s0, s9
+; GFX9-NEXT:    s_and_b32 s7, s0, 0xff
+; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT:    s_or_b32 s7, s7, s8
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s7, s0
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
-; GFX9-NEXT:    s_bfe_u32 s7, s1, s8
+; GFX9-NEXT:    s_bfe_u32 s7, s1, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s4
-; GFX9-NEXT:    s_and_b32 s4, s1, s6
+; GFX9-NEXT:    s_and_b32 s4, s1, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s9
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX9-NEXT:    s_or_b32 s4, s4, s7
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s4, s1
 ; GFX9-NEXT:    s_lshl_b32 s4, s5, 24
 ; GFX9-NEXT:    s_or_b32 s1, s1, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
@@ -2842,16 +2802,17 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_mov_b32 s2, 8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
 ; GFX9-NEXT:    s_mov_b32 s3, 16
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_and_or_b32 v4, v0, s6, v4
+; GFX9-NEXT:    v_and_or_b32 v5, v0, v4, v5
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX9-NEXT:    v_or3_b32 v0, v4, v0, v2
+; GFX9-NEXT:    v_or3_b32 v0, v5, v0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v4, v1, s6, v2
+; GFX9-NEXT:    v_and_or_b32 v4, v1, v4, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -2863,33 +2824,31 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-LABEL: insertelement_s_v8i8_v_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX8-NEXT:    s_mov_b32 s6, 0x80008
-; GFX8-NEXT:    s_movk_i32 s4, 0xff
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 2, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s7, s0, s6
-; GFX8-NEXT:    s_and_b32 s5, s0, s4
-; GFX8-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX8-NEXT:    s_or_b32 s5, s5, s7
-; GFX8-NEXT:    s_mov_b32 s7, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX8-NEXT:    s_bfe_u32 s0, s0, s7
+; GFX8-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX8-NEXT:    s_or_b32 s5, s5, s6
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s5, s0
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX8-NEXT:    s_bfe_u32 s5, s1, s6
+; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s2, s1, s4
+; GFX8-NEXT:    s_and_b32 s2, s1, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s7
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX8-NEXT:    s_or_b32 s2, s2, s5
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, 24
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
@@ -2928,33 +2887,31 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-LABEL: insertelement_s_v8i8_v_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX7-NEXT:    s_mov_b32 s6, 0x80008
 ; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 2, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s7, s0, s6
-; GFX7-NEXT:    s_and_b32 s5, s0, s4
-; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX7-NEXT:    s_or_b32 s5, s5, s7
-; GFX7-NEXT:    s_mov_b32 s7, 0x80010
+; GFX7-NEXT:    s_bfe_u32 s6, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX7-NEXT:    s_bfe_u32 s0, s0, s7
+; GFX7-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX7-NEXT:    s_or_b32 s5, s5, s6
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s5, s0
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX7-NEXT:    s_bfe_u32 s5, s1, s6
+; GFX7-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
-; GFX7-NEXT:    s_and_b32 s2, s1, s4
+; GFX7-NEXT:    s_and_b32 s2, s1, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s7
+; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX7-NEXT:    s_or_b32 s2, s2, s5
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s1, s2, s1
 ; GFX7-NEXT:    s_lshl_b32 s2, s3, 24
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s1
@@ -2982,7 +2939,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_bfe_u32 v4, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
@@ -2999,38 +2956,35 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-LABEL: insertelement_s_v8i8_v_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s3, 0x80008
 ; GFX10-NEXT:    v_and_b32_e32 v2, 3, v1
-; GFX10-NEXT:    s_movk_i32 s2, 0xff
-; GFX10-NEXT:    s_mov_b32 s4, 0x80010
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 2, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v2, s2
+; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v2, 0xff
 ; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s8, s0, s3
-; GFX10-NEXT:    s_bfe_u32 s3, s1, s3
-; GFX10-NEXT:    s_lshr_b32 s6, s1, 24
-; GFX10-NEXT:    s_and_b32 s9, s1, s2
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s4
-; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_bfe_u32 s7, s1, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX10-NEXT:    s_and_b32 s6, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 8
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX10-NEXT:    s_or_b32 s3, s9, s3
-; GFX10-NEXT:    s_lshl_b32 s6, s6, 24
-; GFX10-NEXT:    s_or_b32 s1, s3, s1
-; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX10-NEXT:    s_and_b32 s7, s0, s2
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s4
-; GFX10-NEXT:    s_lshl_b32 s4, s8, 8
-; GFX10-NEXT:    s_or_b32 s1, s1, s6
+; GFX10-NEXT:    s_or_b32 s6, s6, s7
+; GFX10-NEXT:    s_bfe_u32 s5, s0, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 24
+; GFX10-NEXT:    s_or_b32 s1, s6, s1
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 24
+; GFX10-NEXT:    s_and_b32 s4, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX10-NEXT:    s_or_b32 s1, s1, s3
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_or_b32 s4, s7, s4
+; GFX10-NEXT:    s_or_b32 s3, s4, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    s_lshl_b32 s3, s5, 24
-; GFX10-NEXT:    s_or_b32 s0, s4, s0
-; GFX10-NEXT:    s_or_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX10-NEXT:    s_or_b32 s0, s3, s0
+; GFX10-NEXT:    s_or_b32 s0, s0, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, s0, v1, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
@@ -3046,9 +3000,9 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v3, v0, s2, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v0, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_and_or_b32 v5, v1, s2, v5
+; GFX10-NEXT:    v_and_or_b32 v5, 0xff, v1, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3069,102 +3023,103 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
 ; GFX9-NEXT:    s_movk_i32 s3, 0xff
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 2, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 2, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX9-NEXT:    s_and_b32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0xff
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v8, v2, s2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s3
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX9-NEXT:    v_lshlrev_b32_e64 v9, v2, s2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-NEXT:    v_mov_b32_e32 v6, 16
+; GFX9-NEXT:    v_mov_b32_e32 v6, 8
+; GFX9-NEXT:    v_mov_b32_e32 v7, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v13
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v12
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-NEXT:    v_or3_b32 v0, v0, v12, v9
-; GFX9-NEXT:    v_or3_b32 v1, v1, v14, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v2, v9, v2, v8
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_or3_b32 v0, v0, v13, v10
+; GFX9-NEXT:    v_or3_b32 v1, v1, v15, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v10, v2, v9
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v5, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
-; GFX9-NEXT:    v_or3_b32 v0, v0, v9, v2
-; GFX9-NEXT:    v_or3_b32 v1, v1, v6, v5
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v5, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
+; GFX9-NEXT:    v_or3_b32 v0, v0, v10, v2
+; GFX9-NEXT:    v_or3_b32 v1, v1, v7, v5
 ; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v8i8_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v5, 8
-; GFX8-NEXT:    v_mov_b32_e32 v6, 16
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 2, v2
+; GFX8-NEXT:    v_mov_b32_e32 v6, 8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 2, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX8-NEXT:    s_and_b32 s1, s2, s0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v7, 16
+; GFX8-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX8-NEXT:    v_lshlrev_b32_e64 v10, v2, s1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX8-NEXT:    v_lshlrev_b32_e64 v11, v2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
 ; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v9
-; GFX8-NEXT:    v_mov_b32_e32 v7, 8
-; GFX8-NEXT:    v_mov_b32_e32 v8, 16
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v10
+; GFX8-NEXT:    v_mov_b32_e32 v8, 8
+; GFX8-NEXT:    v_mov_b32_e32 v9, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v12
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v12
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v14
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v2, v5, v2
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v10
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v9
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
@@ -3176,63 +3131,64 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s3, 0xff
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
+; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 2, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX7-NEXT:    s_and_b32 s0, s2, s3
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
+; GFX7-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, s0, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v2, s3, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT:    v_lshl_b32_e32 v5, s1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX7-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v8, v0, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v10, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v7, s3, v0
+; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v11, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v8, s0, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v9, s3, v1
+; GFX7-NEXT:    v_and_b32_e32 v10, s0, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT:    v_or_b32_e32 v8, v9, v10
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX7-NEXT:    v_or_b32_e32 v0, v7, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v8, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v2, v5, v2
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v4, s3, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v5, v0, v3
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v6, s3, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, v1, v3
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -3242,7 +3198,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    v_and_b32_e32 v3, 3, v2
 ; GFX10-NEXT:    s_mov_b32 s1, 16
-; GFX10-NEXT:    s_movk_i32 s3, 0xff
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
@@ -3253,12 +3208,12 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s3, v5
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s3, v7
+; GFX10-NEXT:    v_and_or_b32 v1, v1, 0xff, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX10-NEXT:    v_lshlrev_b32_e64 v6, v3, s3
-; GFX10-NEXT:    s_and_b32 s0, s2, s3
+; GFX10-NEXT:    v_lshlrev_b32_e64 v6, v3, 0xff
+; GFX10-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v8, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
 ; GFX10-NEXT:    v_or3_b32 v1, v1, v9, v5
@@ -3276,9 +3231,9 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v5, v0, s3, v5
+; GFX10-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_and_or_b32 v8, v1, s3, v2
+; GFX10-NEXT:    v_and_or_b32 v8, 0xff, v1, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3303,60 +3258,60 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    s_and_b32 s2, s2, 3
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX9-NEXT:    s_lshl_b32 s2, 0xff, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX9-NEXT:    s_not_b32 s2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-NEXT:    v_mov_b32_e32 v6, 16
+; GFX9-NEXT:    v_mov_b32_e32 v6, 8
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v7, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-NEXT:    v_or3_b32 v0, v0, v10, v7
-; GFX9-NEXT:    v_or3_b32 v1, v1, v12, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v2, v7, s2, v2
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX9-NEXT:    v_or3_b32 v0, v0, v11, v8
+; GFX9-NEXT:    v_or3_b32 v1, v1, v13, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v8, s2, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v5, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
-; GFX9-NEXT:    v_or3_b32 v0, v0, v9, v2
-; GFX9-NEXT:    v_or3_b32 v1, v1, v6, v5
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v5, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
+; GFX9-NEXT:    v_or3_b32 v0, v0, v10, v2
+; GFX9-NEXT:    v_or3_b32 v1, v1, v7, v5
 ; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v8i8_v_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    s_lshr_b32 s1, s2, 2
-; GFX8-NEXT:    s_and_b32 s2, s2, 3
+; GFX8-NEXT:    s_and_b32 s1, s2, 3
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 8
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
 ; GFX8-NEXT:    v_mov_b32_e32 v6, 16
-; GFX8-NEXT:    v_mov_b32_e32 v9, s2
+; GFX8-NEXT:    v_mov_b32_e32 v9, s1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
-; GFX8-NEXT:    s_not_b32 s0, s0
+; GFX8-NEXT:    s_lshr_b32 s0, s2, 2
+; GFX8-NEXT:    s_lshl_b32 s1, 0xff, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX8-NEXT:    s_not_b32 s1, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
@@ -3377,9 +3332,9 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v5, s0, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, s1, v5
 ; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -3405,63 +3360,64 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s3, 0xff
-; GFX7-NEXT:    s_and_b32 s1, s2, 3
-; GFX7-NEXT:    s_lshr_b32 s0, s2, 2
-; GFX7-NEXT:    v_and_b32_e32 v2, s3, v2
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v2
-; GFX7-NEXT:    s_lshl_b32 s1, s3, s1
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT:    s_not_b32 s1, s1
+; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
+; GFX7-NEXT:    s_lshr_b32 s1, s2, 2
+; GFX7-NEXT:    s_and_b32 s2, s2, 3
+; GFX7-NEXT:    v_and_b32_e32 v2, v2, v3
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s2, v2
+; GFX7-NEXT:    s_lshl_b32 s2, 0xff, s2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX7-NEXT:    s_not_b32 s2, s2
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v5, s3, v0
+; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v9, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v6, s0, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v7, s3, v1
+; GFX7-NEXT:    v_and_b32_e32 v8, s0, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT:    v_or_b32_e32 v7, v8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v6, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v3, s1, v3
-; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v6, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v7, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v4, s2, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v4, s3, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v5, v0, v3
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v6, s3, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, v1, v3
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -3470,7 +3426,6 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    s_mov_b32 s1, 16
-; GFX10-NEXT:    s_movk_i32 s3, 0xff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -3478,9 +3433,9 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s3, v4
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s3, v6
+; GFX10-NEXT:    v_and_or_b32 v1, v1, 0xff, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
 ; GFX10-NEXT:    s_lshr_b32 s1, s2, 2
 ; GFX10-NEXT:    s_and_b32 s0, s2, 3
@@ -3489,7 +3444,7 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_or3_b32 v1, v1, v8, v4
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    s_lshl_b32 s0, s3, s0
+; GFX10-NEXT:    s_lshl_b32 s0, 0xff, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc_lo
 ; GFX10-NEXT:    s_not_b32 s0, s0
 ; GFX10-NEXT:    v_and_or_b32 v2, v3, s0, v2
@@ -3505,8 +3460,8 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s3, v5
-; GFX10-NEXT:    v_and_or_b32 v3, v1, s3, v3
+; GFX10-NEXT:    v_and_or_b32 v2, 0xff, v0, v5
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v1, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3700,48 +3655,46 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    v_and_b32_e32 v4, 3, v3
 ; GFX10-NEXT:    s_mov_b32 s1, 16
-; GFX10-NEXT:    s_movk_i32 s2, 0xff
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0xff
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s2, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s2, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, v4, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX10-NEXT:    v_and_or_b32 v1, v1, 0xff, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
+; GFX10-NEXT:    v_lshlrev_b32_e64 v7, v4, 0xff
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v3
-; GFX10-NEXT:    v_or3_b32 v0, v0, v10, v6
+; GFX10-NEXT:    v_or3_b32 v0, v0, v9, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 8
-; GFX10-NEXT:    v_or3_b32 v1, v1, v11, v7
-; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v2, v6, v4, v2
+; GFX10-NEXT:    v_or3_b32 v1, v1, v10, v6
+; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc_lo
+; GFX10-NEXT:    v_and_or_b32 v2, v5, v4, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v2, v0, v5, v6
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v2, 0xff, v0, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_and_or_b32 v3, v1, v5, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v1, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_or3_b32 v2, v2, v8, v4
-; GFX10-NEXT:    v_or3_b32 v3, v3, v9, v5
+; GFX10-NEXT:    v_or3_b32 v2, v2, v7, v4
+; GFX10-NEXT:    v_or3_b32 v3, v3, v8, v5
 ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
@@ -3754,47 +3707,44 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-LABEL: insertelement_s_v16i8_s_s:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX9-NEXT:    s_mov_b32 s12, 0x80008
-; GFX9-NEXT:    s_movk_i32 s10, 0xff
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_u32 s13, s0, s12
-; GFX9-NEXT:    s_and_b32 s11, s0, s10
-; GFX9-NEXT:    s_lshl_b32 s13, s13, 8
-; GFX9-NEXT:    s_or_b32 s11, s11, s13
-; GFX9-NEXT:    s_mov_b32 s13, 0x80010
+; GFX9-NEXT:    s_bfe_u32 s11, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
-; GFX9-NEXT:    s_bfe_u32 s0, s0, s13
+; GFX9-NEXT:    s_and_b32 s10, s0, 0xff
+; GFX9-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT:    s_or_b32 s10, s10, s11
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX9-NEXT:    s_or_b32 s0, s11, s0
+; GFX9-NEXT:    s_or_b32 s0, s10, s0
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
-; GFX9-NEXT:    s_bfe_u32 s11, s1, s12
+; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s6
-; GFX9-NEXT:    s_and_b32 s6, s1, s10
-; GFX9-NEXT:    s_lshl_b32 s11, s11, 8
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s13
-; GFX9-NEXT:    s_or_b32 s6, s6, s11
+; GFX9-NEXT:    s_and_b32 s6, s1, 0xff
+; GFX9-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX9-NEXT:    s_or_b32 s6, s6, s10
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s6, s1
 ; GFX9-NEXT:    s_lshl_b32 s6, s7, 24
-; GFX9-NEXT:    s_bfe_u32 s7, s2, s12
+; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s8, s2, 24
 ; GFX9-NEXT:    s_or_b32 s1, s1, s6
-; GFX9-NEXT:    s_and_b32 s6, s2, s10
+; GFX9-NEXT:    s_and_b32 s6, s2, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX9-NEXT:    s_bfe_u32 s2, s2, s13
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x80010
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s2, s6, s2
 ; GFX9-NEXT:    s_lshl_b32 s6, s8, 24
-; GFX9-NEXT:    s_bfe_u32 s7, s3, s12
+; GFX9-NEXT:    s_bfe_u32 s7, s3, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s9, s3, 24
 ; GFX9-NEXT:    s_or_b32 s2, s2, s6
-; GFX9-NEXT:    s_and_b32 s6, s3, s10
+; GFX9-NEXT:    s_and_b32 s6, s3, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX9-NEXT:    s_bfe_u32 s3, s3, s13
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x80010
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX9-NEXT:    s_or_b32 s3, s6, s3
@@ -3809,9 +3759,9 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_cselect_b32 s7, s3, s7
 ; GFX9-NEXT:    s_and_b32 s5, s5, 3
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 3
-; GFX9-NEXT:    s_and_b32 s4, s4, s10
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, s5
-; GFX9-NEXT:    s_lshl_b32 s5, s10, s5
+; GFX9-NEXT:    s_lshl_b32 s5, 0xff, s5
 ; GFX9-NEXT:    s_andn2_b32 s5, s7, s5
 ; GFX9-NEXT:    s_or_b32 s4, s5, s4
 ; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
@@ -3822,41 +3772,41 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
 ; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
-; GFX9-NEXT:    s_bfe_u32 s9, s0, s12
+; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s4, s0, 24
-; GFX9-NEXT:    s_and_b32 s8, s0, s10
+; GFX9-NEXT:    s_and_b32 s8, s0, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX9-NEXT:    s_bfe_u32 s0, s0, s13
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX9-NEXT:    s_or_b32 s8, s8, s9
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s8, s0
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
-; GFX9-NEXT:    s_bfe_u32 s8, s1, s12
+; GFX9-NEXT:    s_bfe_u32 s8, s1, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s4
-; GFX9-NEXT:    s_and_b32 s4, s1, s10
+; GFX9-NEXT:    s_and_b32 s4, s1, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s13
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX9-NEXT:    s_or_b32 s4, s4, s8
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s4, s1
 ; GFX9-NEXT:    s_lshl_b32 s4, s5, 24
-; GFX9-NEXT:    s_bfe_u32 s5, s2, s12
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s6, s2, 24
 ; GFX9-NEXT:    s_or_b32 s1, s1, s4
-; GFX9-NEXT:    s_and_b32 s4, s2, s10
+; GFX9-NEXT:    s_and_b32 s4, s2, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX9-NEXT:    s_bfe_u32 s2, s2, s13
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x80010
 ; GFX9-NEXT:    s_or_b32 s4, s4, s5
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s2, s4, s2
 ; GFX9-NEXT:    s_lshl_b32 s4, s6, 24
-; GFX9-NEXT:    s_bfe_u32 s5, s3, s12
+; GFX9-NEXT:    s_bfe_u32 s5, s3, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s7, s3, 24
 ; GFX9-NEXT:    s_or_b32 s2, s2, s4
-; GFX9-NEXT:    s_and_b32 s4, s3, s10
+; GFX9-NEXT:    s_and_b32 s4, s3, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX9-NEXT:    s_bfe_u32 s3, s3, s13
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x80010
 ; GFX9-NEXT:    s_or_b32 s4, s4, s5
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX9-NEXT:    s_or_b32 s3, s4, s3
@@ -3872,47 +3822,44 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-LABEL: insertelement_s_v16i8_s_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX8-NEXT:    s_mov_b32 s12, 0x80008
-; GFX8-NEXT:    s_movk_i32 s10, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s13, s0, s12
-; GFX8-NEXT:    s_and_b32 s11, s0, s10
-; GFX8-NEXT:    s_lshl_b32 s13, s13, 8
-; GFX8-NEXT:    s_or_b32 s11, s11, s13
-; GFX8-NEXT:    s_mov_b32 s13, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s11, s0, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s6, s0, 24
-; GFX8-NEXT:    s_bfe_u32 s0, s0, s13
+; GFX8-NEXT:    s_and_b32 s10, s0, 0xff
+; GFX8-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX8-NEXT:    s_or_b32 s10, s10, s11
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX8-NEXT:    s_or_b32 s0, s11, s0
+; GFX8-NEXT:    s_or_b32 s0, s10, s0
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 24
-; GFX8-NEXT:    s_bfe_u32 s11, s1, s12
+; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s6
-; GFX8-NEXT:    s_and_b32 s6, s1, s10
-; GFX8-NEXT:    s_lshl_b32 s11, s11, 8
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s13
-; GFX8-NEXT:    s_or_b32 s6, s6, s11
+; GFX8-NEXT:    s_and_b32 s6, s1, 0xff
+; GFX8-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX8-NEXT:    s_or_b32 s6, s6, s10
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s6, s1
 ; GFX8-NEXT:    s_lshl_b32 s6, s7, 24
-; GFX8-NEXT:    s_bfe_u32 s7, s2, s12
+; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s8, s2, 24
 ; GFX8-NEXT:    s_or_b32 s1, s1, s6
-; GFX8-NEXT:    s_and_b32 s6, s2, s10
+; GFX8-NEXT:    s_and_b32 s6, s2, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX8-NEXT:    s_bfe_u32 s2, s2, s13
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x80010
 ; GFX8-NEXT:    s_or_b32 s6, s6, s7
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s2, s6, s2
 ; GFX8-NEXT:    s_lshl_b32 s6, s8, 24
-; GFX8-NEXT:    s_bfe_u32 s7, s3, s12
+; GFX8-NEXT:    s_bfe_u32 s7, s3, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 24
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
-; GFX8-NEXT:    s_and_b32 s6, s3, s10
+; GFX8-NEXT:    s_and_b32 s6, s3, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX8-NEXT:    s_bfe_u32 s3, s3, s13
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x80010
 ; GFX8-NEXT:    s_or_b32 s6, s6, s7
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX8-NEXT:    s_or_b32 s3, s6, s3
@@ -3927,9 +3874,9 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_cselect_b32 s7, s3, s7
 ; GFX8-NEXT:    s_and_b32 s5, s5, 3
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 3
-; GFX8-NEXT:    s_and_b32 s4, s4, s10
+; GFX8-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, s5
-; GFX8-NEXT:    s_lshl_b32 s5, s10, s5
+; GFX8-NEXT:    s_lshl_b32 s5, 0xff, s5
 ; GFX8-NEXT:    s_andn2_b32 s5, s7, s5
 ; GFX8-NEXT:    s_or_b32 s4, s5, s4
 ; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
@@ -3940,41 +3887,41 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_cselect_b32 s2, s4, s2
 ; GFX8-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX8-NEXT:    s_cselect_b32 s3, s4, s3
-; GFX8-NEXT:    s_bfe_u32 s9, s0, s12
+; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
-; GFX8-NEXT:    s_and_b32 s8, s0, s10
+; GFX8-NEXT:    s_and_b32 s8, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX8-NEXT:    s_bfe_u32 s0, s0, s13
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX8-NEXT:    s_or_b32 s8, s8, s9
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s8, s0
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 24
-; GFX8-NEXT:    s_bfe_u32 s8, s1, s12
+; GFX8-NEXT:    s_bfe_u32 s8, s1, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s4
-; GFX8-NEXT:    s_and_b32 s4, s1, s10
+; GFX8-NEXT:    s_and_b32 s4, s1, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s13
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX8-NEXT:    s_or_b32 s4, s4, s8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s4, s1
 ; GFX8-NEXT:    s_lshl_b32 s4, s5, 24
-; GFX8-NEXT:    s_bfe_u32 s5, s2, s12
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 24
 ; GFX8-NEXT:    s_or_b32 s1, s1, s4
-; GFX8-NEXT:    s_and_b32 s4, s2, s10
+; GFX8-NEXT:    s_and_b32 s4, s2, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX8-NEXT:    s_bfe_u32 s2, s2, s13
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x80010
 ; GFX8-NEXT:    s_or_b32 s4, s4, s5
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s2, s4, s2
 ; GFX8-NEXT:    s_lshl_b32 s4, s6, 24
-; GFX8-NEXT:    s_bfe_u32 s5, s3, s12
+; GFX8-NEXT:    s_bfe_u32 s5, s3, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s7, s3, 24
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
-; GFX8-NEXT:    s_and_b32 s4, s3, s10
+; GFX8-NEXT:    s_and_b32 s4, s3, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX8-NEXT:    s_bfe_u32 s3, s3, s13
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x80010
 ; GFX8-NEXT:    s_or_b32 s4, s4, s5
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX8-NEXT:    s_or_b32 s3, s4, s3
@@ -3990,45 +3937,42 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-LABEL: insertelement_s_v16i8_s_s:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX7-NEXT:    s_mov_b32 s12, 0x80008
-; GFX7-NEXT:    s_movk_i32 s10, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s13, s0, s12
-; GFX7-NEXT:    s_and_b32 s11, s0, s10
-; GFX7-NEXT:    s_lshl_b32 s13, s13, 8
-; GFX7-NEXT:    s_or_b32 s11, s11, s13
-; GFX7-NEXT:    s_mov_b32 s13, 0x80010
+; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s6, s0, 24
-; GFX7-NEXT:    s_bfe_u32 s0, s0, s13
+; GFX7-NEXT:    s_and_b32 s10, s0, 0xff
+; GFX7-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX7-NEXT:    s_or_b32 s10, s10, s11
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX7-NEXT:    s_or_b32 s0, s11, s0
+; GFX7-NEXT:    s_or_b32 s0, s10, s0
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
-; GFX7-NEXT:    s_bfe_u32 s11, s1, s12
+; GFX7-NEXT:    s_bfe_u32 s10, s1, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s6
-; GFX7-NEXT:    s_and_b32 s6, s1, s10
-; GFX7-NEXT:    s_lshl_b32 s11, s11, 8
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s13
-; GFX7-NEXT:    s_or_b32 s6, s6, s11
+; GFX7-NEXT:    s_and_b32 s6, s1, 0xff
+; GFX7-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX7-NEXT:    s_or_b32 s6, s6, s10
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s1, s6, s1
 ; GFX7-NEXT:    s_lshl_b32 s6, s7, 24
-; GFX7-NEXT:    s_bfe_u32 s7, s2, s12
+; GFX7-NEXT:    s_bfe_u32 s7, s2, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s8, s2, 24
 ; GFX7-NEXT:    s_or_b32 s1, s1, s6
-; GFX7-NEXT:    s_and_b32 s6, s2, s10
+; GFX7-NEXT:    s_and_b32 s6, s2, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX7-NEXT:    s_bfe_u32 s2, s2, s13
+; GFX7-NEXT:    s_bfe_u32 s2, s2, 0x80010
 ; GFX7-NEXT:    s_or_b32 s6, s6, s7
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX7-NEXT:    s_or_b32 s2, s6, s2
 ; GFX7-NEXT:    s_lshl_b32 s6, s8, 24
-; GFX7-NEXT:    s_bfe_u32 s7, s3, s12
+; GFX7-NEXT:    s_bfe_u32 s7, s3, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s9, s3, 24
 ; GFX7-NEXT:    s_or_b32 s2, s2, s6
-; GFX7-NEXT:    s_and_b32 s6, s3, s10
+; GFX7-NEXT:    s_and_b32 s6, s3, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX7-NEXT:    s_bfe_u32 s3, s3, s13
+; GFX7-NEXT:    s_bfe_u32 s3, s3, 0x80010
 ; GFX7-NEXT:    s_or_b32 s6, s6, s7
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX7-NEXT:    s_or_b32 s3, s6, s3
@@ -4043,9 +3987,9 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_cselect_b32 s7, s3, s7
 ; GFX7-NEXT:    s_and_b32 s5, s5, 3
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 3
-; GFX7-NEXT:    s_and_b32 s4, s4, s10
+; GFX7-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, s5
-; GFX7-NEXT:    s_lshl_b32 s5, s10, s5
+; GFX7-NEXT:    s_lshl_b32 s5, 0xff, s5
 ; GFX7-NEXT:    s_andn2_b32 s5, s7, s5
 ; GFX7-NEXT:    s_or_b32 s4, s5, s4
 ; GFX7-NEXT:    s_cmp_eq_u32 s6, 0
@@ -4056,41 +4000,41 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_cselect_b32 s2, s4, s2
 ; GFX7-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX7-NEXT:    s_cselect_b32 s3, s4, s3
-; GFX7-NEXT:    s_bfe_u32 s14, s5, s12
+; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s4, s5, 24
-; GFX7-NEXT:    s_and_b32 s11, s5, s10
-; GFX7-NEXT:    s_lshl_b32 s14, s14, 8
-; GFX7-NEXT:    s_bfe_u32 s5, s5, s13
-; GFX7-NEXT:    s_or_b32 s11, s11, s14
+; GFX7-NEXT:    s_and_b32 s10, s5, 0xff
+; GFX7-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX7-NEXT:    s_bfe_u32 s5, s5, 0x80010
+; GFX7-NEXT:    s_or_b32 s10, s10, s11
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
-; GFX7-NEXT:    s_or_b32 s5, s11, s5
+; GFX7-NEXT:    s_or_b32 s5, s10, s5
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 24
-; GFX7-NEXT:    s_bfe_u32 s11, s7, s12
+; GFX7-NEXT:    s_bfe_u32 s10, s7, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s6, s7, 24
 ; GFX7-NEXT:    s_or_b32 s4, s5, s4
-; GFX7-NEXT:    s_and_b32 s5, s7, s10
-; GFX7-NEXT:    s_lshl_b32 s11, s11, 8
-; GFX7-NEXT:    s_bfe_u32 s7, s7, s13
-; GFX7-NEXT:    s_or_b32 s5, s5, s11
+; GFX7-NEXT:    s_and_b32 s5, s7, 0xff
+; GFX7-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX7-NEXT:    s_bfe_u32 s7, s7, 0x80010
+; GFX7-NEXT:    s_or_b32 s5, s5, s10
 ; GFX7-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX7-NEXT:    s_or_b32 s5, s5, s7
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
-; GFX7-NEXT:    s_bfe_u32 s7, s2, s12
+; GFX7-NEXT:    s_bfe_u32 s7, s2, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s8, s2, 24
 ; GFX7-NEXT:    s_or_b32 s5, s5, s6
-; GFX7-NEXT:    s_and_b32 s6, s2, s10
+; GFX7-NEXT:    s_and_b32 s6, s2, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX7-NEXT:    s_bfe_u32 s2, s2, s13
+; GFX7-NEXT:    s_bfe_u32 s2, s2, 0x80010
 ; GFX7-NEXT:    s_or_b32 s6, s6, s7
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX7-NEXT:    s_or_b32 s2, s6, s2
 ; GFX7-NEXT:    s_lshl_b32 s6, s8, 24
-; GFX7-NEXT:    s_bfe_u32 s7, s3, s12
+; GFX7-NEXT:    s_bfe_u32 s7, s3, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s9, s3, 24
 ; GFX7-NEXT:    s_or_b32 s6, s2, s6
-; GFX7-NEXT:    s_and_b32 s2, s3, s10
+; GFX7-NEXT:    s_and_b32 s2, s3, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX7-NEXT:    s_bfe_u32 s3, s3, s13
+; GFX7-NEXT:    s_bfe_u32 s3, s3, 0x80010
 ; GFX7-NEXT:    s_or_b32 s2, s2, s7
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX7-NEXT:    s_or_b32 s2, s2, s3
@@ -4109,112 +4053,109 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX10-LABEL: insertelement_s_v16i8_s_s:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s7, 0x80008
-; GFX10-NEXT:    s_movk_i32 s6, 0xff
-; GFX10-NEXT:    s_mov_b32 s8, 0x80010
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s14, s0, s7
-; GFX10-NEXT:    s_lshr_b32 s9, s0, 24
-; GFX10-NEXT:    s_and_b32 s13, s0, s6
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s8
-; GFX10-NEXT:    s_bfe_u32 s16, s1, s7
-; GFX10-NEXT:    s_lshl_b32 s14, s14, 8
-; GFX10-NEXT:    s_lshr_b32 s10, s1, 24
-; GFX10-NEXT:    s_and_b32 s15, s1, s6
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s8
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_lshl_b32 s16, s16, 8
-; GFX10-NEXT:    s_or_b32 s13, s13, s14
-; GFX10-NEXT:    s_bfe_u32 s18, s2, s7
-; GFX10-NEXT:    s_lshl_b32 s9, s9, 24
+; GFX10-NEXT:    s_bfe_u32 s11, s0, 0x80008
+; GFX10-NEXT:    s_bfe_u32 s13, s1, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX10-NEXT:    s_and_b32 s10, s0, 0xff
+; GFX10-NEXT:    s_and_b32 s12, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX10-NEXT:    s_lshl_b32 s13, s13, 8
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 24
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX10-NEXT:    s_or_b32 s14, s15, s16
-; GFX10-NEXT:    s_or_b32 s0, s13, s0
-; GFX10-NEXT:    s_lshr_b32 s11, s2, 24
-; GFX10-NEXT:    s_and_b32 s17, s2, s6
-; GFX10-NEXT:    s_lshl_b32 s10, s10, 24
-; GFX10-NEXT:    s_or_b32 s1, s14, s1
-; GFX10-NEXT:    s_or_b32 s0, s0, s9
-; GFX10-NEXT:    s_lshl_b32 s9, s18, 8
-; GFX10-NEXT:    s_bfe_u32 s2, s2, s8
-; GFX10-NEXT:    s_or_b32 s9, s17, s9
+; GFX10-NEXT:    s_or_b32 s10, s10, s11
+; GFX10-NEXT:    s_or_b32 s11, s12, s13
+; GFX10-NEXT:    s_bfe_u32 s15, s2, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 24
+; GFX10-NEXT:    s_or_b32 s1, s11, s1
+; GFX10-NEXT:    s_lshr_b32 s8, s2, 24
+; GFX10-NEXT:    s_and_b32 s14, s2, 0xff
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX10-NEXT:    s_lshl_b32 s15, s15, 8
+; GFX10-NEXT:    s_or_b32 s0, s10, s0
+; GFX10-NEXT:    s_or_b32 s1, s1, s7
+; GFX10-NEXT:    s_bfe_u32 s7, s3, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s9, s3, 24
+; GFX10-NEXT:    s_or_b32 s12, s14, s15
+; GFX10-NEXT:    s_or_b32 s0, s0, s6
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX10-NEXT:    s_or_b32 s1, s1, s10
-; GFX10-NEXT:    s_bfe_u32 s10, s3, s7
-; GFX10-NEXT:    s_lshr_b32 s12, s3, 24
-; GFX10-NEXT:    s_or_b32 s2, s9, s2
-; GFX10-NEXT:    s_lshl_b32 s9, s11, 24
-; GFX10-NEXT:    s_and_b32 s11, s3, s6
-; GFX10-NEXT:    s_lshl_b32 s10, s10, 8
-; GFX10-NEXT:    s_bfe_u32 s3, s3, s8
-; GFX10-NEXT:    s_or_b32 s10, s11, s10
+; GFX10-NEXT:    s_lshl_b32 s6, s8, 24
+; GFX10-NEXT:    s_and_b32 s8, s3, 0xff
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX10-NEXT:    s_bfe_u32 s3, s3, 0x80010
+; GFX10-NEXT:    s_or_b32 s2, s12, s2
+; GFX10-NEXT:    s_or_b32 s7, s8, s7
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX10-NEXT:    s_or_b32 s2, s2, s9
-; GFX10-NEXT:    s_or_b32 s3, s10, s3
-; GFX10-NEXT:    s_lshl_b32 s9, s12, 24
-; GFX10-NEXT:    s_lshr_b32 s10, s5, 2
-; GFX10-NEXT:    s_or_b32 s3, s3, s9
-; GFX10-NEXT:    s_cmp_eq_u32 s10, 1
-; GFX10-NEXT:    s_cselect_b32 s9, s1, s0
-; GFX10-NEXT:    s_cmp_eq_u32 s10, 2
-; GFX10-NEXT:    s_cselect_b32 s9, s2, s9
-; GFX10-NEXT:    s_cmp_eq_u32 s10, 3
-; GFX10-NEXT:    s_cselect_b32 s9, s3, s9
+; GFX10-NEXT:    s_or_b32 s2, s2, s6
+; GFX10-NEXT:    s_or_b32 s3, s7, s3
+; GFX10-NEXT:    s_lshl_b32 s6, s9, 24
+; GFX10-NEXT:    s_lshr_b32 s7, s5, 2
+; GFX10-NEXT:    s_or_b32 s3, s3, s6
+; GFX10-NEXT:    s_cmp_eq_u32 s7, 1
+; GFX10-NEXT:    s_cselect_b32 s6, s1, s0
+; GFX10-NEXT:    s_cmp_eq_u32 s7, 2
+; GFX10-NEXT:    s_cselect_b32 s6, s2, s6
+; GFX10-NEXT:    s_cmp_eq_u32 s7, 3
+; GFX10-NEXT:    s_cselect_b32 s6, s3, s6
 ; GFX10-NEXT:    s_and_b32 s5, s5, 3
-; GFX10-NEXT:    s_and_b32 s4, s4, s6
+; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 3
-; GFX10-NEXT:    s_lshl_b32 s11, s6, s5
+; GFX10-NEXT:    s_lshl_b32 s8, 0xff, s5
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, s5
-; GFX10-NEXT:    s_andn2_b32 s5, s9, s11
+; GFX10-NEXT:    s_andn2_b32 s5, s6, s8
 ; GFX10-NEXT:    s_or_b32 s4, s5, s4
-; GFX10-NEXT:    s_cmp_eq_u32 s10, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s7, 0
 ; GFX10-NEXT:    s_cselect_b32 s0, s4, s0
-; GFX10-NEXT:    s_cmp_eq_u32 s10, 1
+; GFX10-NEXT:    s_cmp_eq_u32 s7, 1
 ; GFX10-NEXT:    s_cselect_b32 s1, s4, s1
-; GFX10-NEXT:    s_cmp_eq_u32 s10, 2
+; GFX10-NEXT:    s_cmp_eq_u32 s7, 2
 ; GFX10-NEXT:    s_cselect_b32 s2, s4, s2
-; GFX10-NEXT:    s_cmp_eq_u32 s10, 3
+; GFX10-NEXT:    s_cmp_eq_u32 s7, 3
 ; GFX10-NEXT:    s_cselect_b32 s3, s4, s3
-; GFX10-NEXT:    s_bfe_u32 s10, s0, s7
+; GFX10-NEXT:    s_bfe_u32 s7, s0, 0x80008
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
-; GFX10-NEXT:    s_and_b32 s11, s0, s6
-; GFX10-NEXT:    s_lshl_b32 s10, s10, 8
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s8
-; GFX10-NEXT:    s_or_b32 s10, s11, s10
+; GFX10-NEXT:    s_and_b32 s8, s0, 0xff
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX10-NEXT:    s_or_b32 s7, s8, s7
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s1, 24
-; GFX10-NEXT:    s_or_b32 s0, s10, s0
-; GFX10-NEXT:    s_bfe_u32 s10, s1, s7
+; GFX10-NEXT:    s_or_b32 s0, s7, s0
+; GFX10-NEXT:    s_bfe_u32 s7, s1, 0x80008
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 24
-; GFX10-NEXT:    s_and_b32 s12, s1, s6
-; GFX10-NEXT:    s_lshl_b32 s10, s10, 8
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s8
-; GFX10-NEXT:    s_or_b32 s10, s12, s10
+; GFX10-NEXT:    s_and_b32 s9, s1, 0xff
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT:    s_or_b32 s7, s9, s7
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX10-NEXT:    s_or_b32 s0, s0, s4
 ; GFX10-NEXT:    s_lshl_b32 s4, s5, 24
-; GFX10-NEXT:    s_bfe_u32 s5, s2, s7
-; GFX10-NEXT:    s_lshr_b32 s9, s2, 24
-; GFX10-NEXT:    s_or_b32 s1, s10, s1
-; GFX10-NEXT:    s_and_b32 s10, s2, s6
+; GFX10-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s6, s2, 24
+; GFX10-NEXT:    s_or_b32 s1, s7, s1
+; GFX10-NEXT:    s_and_b32 s7, s2, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX10-NEXT:    s_bfe_u32 s2, s2, s8
-; GFX10-NEXT:    s_or_b32 s5, s10, s5
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x80010
+; GFX10-NEXT:    s_or_b32 s5, s7, s5
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX10-NEXT:    s_or_b32 s1, s1, s4
-; GFX10-NEXT:    s_bfe_u32 s4, s3, s7
-; GFX10-NEXT:    s_lshr_b32 s11, s3, 24
+; GFX10-NEXT:    s_bfe_u32 s4, s3, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s8, s3, 24
 ; GFX10-NEXT:    s_or_b32 s2, s5, s2
-; GFX10-NEXT:    s_and_b32 s5, s3, s6
+; GFX10-NEXT:    s_and_b32 s5, s3, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
-; GFX10-NEXT:    s_bfe_u32 s3, s3, s8
+; GFX10-NEXT:    s_bfe_u32 s3, s3, 0x80010
 ; GFX10-NEXT:    s_or_b32 s4, s5, s4
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX10-NEXT:    s_lshl_b32 s5, s9, 24
+; GFX10-NEXT:    s_lshl_b32 s5, s6, 24
 ; GFX10-NEXT:    s_or_b32 s3, s4, s3
-; GFX10-NEXT:    s_lshl_b32 s4, s11, 24
+; GFX10-NEXT:    s_lshl_b32 s4, s8, 24
 ; GFX10-NEXT:    s_or_b32 s2, s2, s5
 ; GFX10-NEXT:    s_or_b32 s3, s3, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
@@ -4235,19 +4176,18 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    s_movk_i32 s6, 0xff
-; GFX9-NEXT:    v_mov_b32_e32 v6, 8
-; GFX9-NEXT:    v_mov_b32_e32 v7, 16
-; GFX9-NEXT:    s_lshr_b32 s4, s3, 2
+; GFX9-NEXT:    s_movk_i32 s4, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v7, 8
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v8, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s3, 2
 ; GFX9-NEXT:    s_and_b32 s3, s3, 3
-; GFX9-NEXT:    s_and_b32 s2, s2, s6
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 3
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX9-NEXT:    s_lshl_b32 s3, s6, s3
-; GFX9-NEXT:    s_not_b32 s5, s3
-; GFX9-NEXT:    v_mov_b32_e32 v8, s2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX9-NEXT:    s_lshl_b32 s3, 0xff, s3
+; GFX9-NEXT:    s_not_b32 s6, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4258,57 +4198,59 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s6, v13
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v15
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v13
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v19, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v17
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v19, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v6, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v14, v9
 ; GFX9-NEXT:    v_or3_b32 v1, v1, v16, v10
-; GFX9-NEXT:    v_and_or_b32 v13, v3, s6, v19
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v13, v3, v6, v19
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
 ; GFX9-NEXT:    v_or3_b32 v2, v2, v18, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v0, v1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s5, 2
+; GFX9-NEXT:    v_mov_b32_e32 v15, s2
 ; GFX9-NEXT:    v_or3_b32 v3, v13, v3, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v2, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s5, 3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v3, s[2:3]
-; GFX9-NEXT:    v_and_or_b32 v8, v9, s5, v8
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[2:3]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s6, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v14
+; GFX9-NEXT:    v_and_or_b32 v9, v9, s6, v15
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s5, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v6, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v16
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v6, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v11
-; GFX9-NEXT:    v_or3_b32 v0, v0, v13, v8
-; GFX9-NEXT:    v_or3_b32 v1, v1, v15, v9
-; GFX9-NEXT:    v_or3_b32 v2, v2, v17, v10
-; GFX9-NEXT:    v_or3_b32 v3, v3, v7, v6
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v6, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_and_or_b32 v3, v3, v6, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v12
+; GFX9-NEXT:    v_or3_b32 v0, v0, v14, v9
+; GFX9-NEXT:    v_or3_b32 v1, v1, v16, v10
+; GFX9-NEXT:    v_or3_b32 v2, v2, v18, v11
+; GFX9-NEXT:    v_or3_b32 v3, v3, v8, v6
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -4319,14 +4261,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 16
-; GFX8-NEXT:    s_and_b32 s1, s3, 3
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    s_and_b32 s0, s3, 3
 ; GFX8-NEXT:    s_lshr_b32 s4, s3, 2
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX8-NEXT:    s_and_b32 s2, s2, s0
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s2, 0xff
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX8-NEXT:    s_lshl_b32 s5, s1, s0
+; GFX8-NEXT:    s_lshl_b32 s0, 0xff, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT:    s_lshl_b32 s5, s2, s1
 ; GFX8-NEXT:    s_not_b32 s6, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
@@ -4408,108 +4349,109 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s6, 0xff
-; GFX7-NEXT:    s_and_b32 s0, s3, 3
+; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX7-NEXT:    s_and_b32 s1, s3, 3
 ; GFX7-NEXT:    s_lshr_b32 s4, s3, 2
-; GFX7-NEXT:    s_and_b32 s1, s2, s6
-; GFX7-NEXT:    s_lshl_b32 s0, s0, 3
-; GFX7-NEXT:    s_lshl_b32 s5, s1, s0
-; GFX7-NEXT:    s_lshl_b32 s0, s6, s0
+; GFX7-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX7-NEXT:    s_lshl_b32 s5, s2, s1
+; GFX7-NEXT:    s_lshl_b32 s1, 0xff, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT:    s_not_b32 s7, s0
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX7-NEXT:    s_not_b32 s6, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v11, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v8, s6, v0
+; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v12, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v9, s0, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v10, s6, v1
+; GFX7-NEXT:    v_and_b32_e32 v11, s0, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v13, v2, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; GFX7-NEXT:    v_and_b32_e32 v12, s6, v2
+; GFX7-NEXT:    v_bfe_u32 v14, v2, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v13, v2, v4
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v15, v3, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v16, v3, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
-; GFX7-NEXT:    v_and_b32_e32 v14, s6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v15, v3, v4
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
-; GFX7-NEXT:    v_or_b32_e32 v10, v12, v13
-; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v11, v14, v15
-; GFX7-NEXT:    v_or_b32_e32 v2, v10, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
+; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
+; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v10, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX7-NEXT:    v_or_b32_e32 v3, v11, v3
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
-; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s[2:3]
-; GFX7-NEXT:    v_and_b32_e32 v4, s7, v4
-; GFX7-NEXT:    v_or_b32_e32 v4, s5, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v12, v15, v16
+; GFX7-NEXT:    v_or_b32_e32 v2, v11, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX7-NEXT:    v_or_b32_e32 v3, v12, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s[2:3]
+; GFX7-NEXT:    v_and_b32_e32 v5, s6, v5
+; GFX7-NEXT:    v_or_b32_e32 v5, s5, v5
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v11, v1, 8, 8
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v8, s6, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v9, v0, v4
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v10, s6, v1
-; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v13, v2, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; GFX7-NEXT:    v_and_b32_e32 v12, s6, v2
+; GFX7-NEXT:    v_bfe_u32 v12, v1, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v14, v2, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v11, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v13, v2, v4
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
+; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
+; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_or_b32_e32 v10, v12, v13
-; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v10, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_or_b32_e32 v2, v11, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s6, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v10, v1
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX7-NEXT:    s_endpgm
@@ -4520,11 +4462,10 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX10-NEXT:    s_mov_b32 s1, 16
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 16
-; GFX10-NEXT:    s_lshr_b32 s5, s3, 2
-; GFX10-NEXT:    s_and_b32 s2, s2, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 1
+; GFX10-NEXT:    s_lshr_b32 s4, s3, 2
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s4, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
@@ -4534,34 +4475,34 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v10
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s4, v12
+; GFX10-NEXT:    v_and_or_b32 v1, v1, 0xff, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_or_b32 v2, v2, s4, v14
+; GFX10-NEXT:    v_and_or_b32 v2, 0xff, v2, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v11, v6
 ; GFX10-NEXT:    v_or3_b32 v1, v1, v13, v7
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v3, v3, s4, v16
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v3, v16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v9
 ; GFX10-NEXT:    v_or3_b32 v2, v2, v15, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s5, 2
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s4, 2
 ; GFX10-NEXT:    s_and_b32 s1, s3, 3
 ; GFX10-NEXT:    v_or3_b32 v3, v3, v10, v6
 ; GFX10-NEXT:    s_lshl_b32 s3, s1, 3
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s5, 3
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s4, 3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, v2, s0
-; GFX10-NEXT:    s_lshl_b32 s6, s4, s3
+; GFX10-NEXT:    s_lshl_b32 s5, 0xff, s3
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX10-NEXT:    s_not_b32 s3, s6
+; GFX10-NEXT:    s_not_b32 s3, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v3, s1
 ; GFX10-NEXT:    v_and_or_b32 v6, v6, s3, s2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s5, 0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s4, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
@@ -4578,13 +4519,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v10
+; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s4, v12
+; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v1, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_and_or_b32 v2, v2, s4, v14
+; GFX10-NEXT:    v_and_or_b32 v2, 0xff, v2, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX10-NEXT:    v_and_or_b32 v3, v3, s4, v4
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
@@ -4604,47 +4545,46 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-LABEL: insertelement_s_v16i8_v_s:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX9-NEXT:    s_mov_b32 s13, 0x80008
-; GFX9-NEXT:    s_movk_i32 s11, 0xff
-; GFX9-NEXT:    v_and_b32_e32 v0, s11, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    s_mov_b32 s5, 8
+; GFX9-NEXT:    s_mov_b32 s6, 16
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_u32 s14, s0, s13
-; GFX9-NEXT:    s_and_b32 s12, s0, s11
-; GFX9-NEXT:    s_lshl_b32 s14, s14, 8
-; GFX9-NEXT:    s_or_b32 s12, s12, s14
-; GFX9-NEXT:    s_mov_b32 s14, 0x80010
+; GFX9-NEXT:    s_bfe_u32 s12, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s7, s0, 24
-; GFX9-NEXT:    s_bfe_u32 s0, s0, s14
+; GFX9-NEXT:    s_and_b32 s11, s0, 0xff
+; GFX9-NEXT:    s_lshl_b32 s12, s12, 8
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT:    s_or_b32 s11, s11, s12
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX9-NEXT:    s_or_b32 s0, s12, s0
+; GFX9-NEXT:    s_or_b32 s0, s11, s0
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 24
-; GFX9-NEXT:    s_bfe_u32 s12, s1, s13
+; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s7
-; GFX9-NEXT:    s_and_b32 s7, s1, s11
-; GFX9-NEXT:    s_lshl_b32 s12, s12, 8
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s14
-; GFX9-NEXT:    s_or_b32 s7, s7, s12
+; GFX9-NEXT:    s_and_b32 s7, s1, 0xff
+; GFX9-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX9-NEXT:    s_or_b32 s7, s7, s11
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s7, s1
 ; GFX9-NEXT:    s_lshl_b32 s7, s8, 24
-; GFX9-NEXT:    s_bfe_u32 s8, s2, s13
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s9, s2, 24
 ; GFX9-NEXT:    s_or_b32 s1, s1, s7
-; GFX9-NEXT:    s_and_b32 s7, s2, s11
+; GFX9-NEXT:    s_and_b32 s7, s2, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX9-NEXT:    s_bfe_u32 s2, s2, s14
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x80010
 ; GFX9-NEXT:    s_or_b32 s7, s7, s8
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s2, s7, s2
 ; GFX9-NEXT:    s_lshl_b32 s7, s9, 24
-; GFX9-NEXT:    s_bfe_u32 s8, s3, s13
+; GFX9-NEXT:    s_bfe_u32 s8, s3, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s10, s3, 24
 ; GFX9-NEXT:    s_or_b32 s2, s2, s7
-; GFX9-NEXT:    s_and_b32 s7, s3, s11
+; GFX9-NEXT:    s_and_b32 s7, s3, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX9-NEXT:    s_bfe_u32 s3, s3, s14
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x80010
 ; GFX9-NEXT:    s_or_b32 s7, s7, s8
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX9-NEXT:    s_or_b32 s3, s7, s3
@@ -4659,48 +4599,47 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_cselect_b32 s8, s3, s8
 ; GFX9-NEXT:    s_and_b32 s4, s4, 3
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 3
-; GFX9-NEXT:    s_lshl_b32 s9, s11, s4
+; GFX9-NEXT:    s_lshl_b32 s9, 0xff, s4
 ; GFX9-NEXT:    s_andn2_b32 s8, s8, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-NEXT:    v_lshl_or_b32 v4, v0, s4, v1
+; GFX9-NEXT:    v_lshl_or_b32 v5, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 3
-; GFX9-NEXT:    s_mov_b32 s6, 16
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v8, v0, s11, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v9, v0, v4, v9
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-NEXT:    v_or3_b32 v0, v8, v0, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v4, v1, s11, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v5
-; GFX9-NEXT:    v_mov_b32_e32 v4, 8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_mov_b32_e32 v8, 16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
-; GFX9-NEXT:    v_and_or_b32 v5, v2, s11, v5
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX9-NEXT:    v_or3_b32 v0, v9, v0, v5
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v5, v1, v4, v5
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_or3_b32 v2, v5, v2, v6
-; GFX9-NEXT:    v_and_or_b32 v6, v3, s11, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_or3_b32 v1, v5, v1, v6
+; GFX9-NEXT:    v_mov_b32_e32 v5, 8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v9, 16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX9-NEXT:    v_and_or_b32 v6, v2, v4, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_or3_b32 v2, v6, v2, v7
+; GFX9-NEXT:    v_and_or_b32 v6, v3, v4, v5
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v8
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_or3_b32 v3, v6, v3, v7
@@ -4710,47 +4649,44 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-LABEL: insertelement_s_v16i8_v_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX8-NEXT:    s_mov_b32 s11, 0x80008
-; GFX8-NEXT:    s_movk_i32 s9, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s12, s0, s11
-; GFX8-NEXT:    s_and_b32 s10, s0, s9
-; GFX8-NEXT:    s_lshl_b32 s12, s12, 8
-; GFX8-NEXT:    s_or_b32 s10, s10, s12
-; GFX8-NEXT:    s_mov_b32 s12, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s10, s0, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX8-NEXT:    s_bfe_u32 s0, s0, s12
+; GFX8-NEXT:    s_and_b32 s9, s0, 0xff
+; GFX8-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX8-NEXT:    s_or_b32 s9, s9, s10
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX8-NEXT:    s_or_b32 s0, s10, s0
+; GFX8-NEXT:    s_or_b32 s0, s9, s0
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 24
-; GFX8-NEXT:    s_bfe_u32 s10, s1, s11
+; GFX8-NEXT:    s_bfe_u32 s9, s1, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s5
-; GFX8-NEXT:    s_and_b32 s5, s1, s9
-; GFX8-NEXT:    s_lshl_b32 s10, s10, 8
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s12
-; GFX8-NEXT:    s_or_b32 s5, s5, s10
+; GFX8-NEXT:    s_and_b32 s5, s1, 0xff
+; GFX8-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX8-NEXT:    s_or_b32 s5, s5, s9
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s5, s1
 ; GFX8-NEXT:    s_lshl_b32 s5, s6, 24
-; GFX8-NEXT:    s_bfe_u32 s6, s2, s11
+; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s7, s2, 24
 ; GFX8-NEXT:    s_or_b32 s1, s1, s5
-; GFX8-NEXT:    s_and_b32 s5, s2, s9
+; GFX8-NEXT:    s_and_b32 s5, s2, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX8-NEXT:    s_bfe_u32 s2, s2, s12
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x80010
 ; GFX8-NEXT:    s_or_b32 s5, s5, s6
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s2, s5, s2
 ; GFX8-NEXT:    s_lshl_b32 s5, s7, 24
-; GFX8-NEXT:    s_bfe_u32 s6, s3, s11
+; GFX8-NEXT:    s_bfe_u32 s6, s3, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s8, s3, 24
 ; GFX8-NEXT:    s_or_b32 s2, s2, s5
-; GFX8-NEXT:    s_and_b32 s5, s3, s9
+; GFX8-NEXT:    s_and_b32 s5, s3, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX8-NEXT:    s_bfe_u32 s3, s3, s12
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x80010
 ; GFX8-NEXT:    s_or_b32 s5, s5, s6
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX8-NEXT:    s_or_b32 s3, s5, s3
@@ -4766,7 +4702,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s4, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    s_lshl_b32 s4, s9, s4
+; GFX8-NEXT:    s_lshl_b32 s4, 0xff, s4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    s_andn2_b32 s4, s6, s4
 ; GFX8-NEXT:    v_or_b32_e32 v4, s4, v0
@@ -4820,46 +4756,44 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-LABEL: insertelement_s_v16i8_v_s:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX7-NEXT:    s_mov_b32 s11, 0x80008
-; GFX7-NEXT:    s_movk_i32 s9, 0xff
-; GFX7-NEXT:    v_and_b32_e32 v0, s9, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s12, s0, s11
-; GFX7-NEXT:    s_and_b32 s10, s0, s9
-; GFX7-NEXT:    s_lshl_b32 s12, s12, 8
-; GFX7-NEXT:    s_or_b32 s10, s10, s12
-; GFX7-NEXT:    s_mov_b32 s12, 0x80010
+; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX7-NEXT:    s_bfe_u32 s0, s0, s12
+; GFX7-NEXT:    s_and_b32 s9, s0, 0xff
+; GFX7-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX7-NEXT:    s_or_b32 s9, s9, s10
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX7-NEXT:    s_or_b32 s0, s10, s0
+; GFX7-NEXT:    s_or_b32 s0, s9, s0
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
-; GFX7-NEXT:    s_bfe_u32 s10, s1, s11
+; GFX7-NEXT:    s_bfe_u32 s9, s1, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s6, s1, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s5
-; GFX7-NEXT:    s_and_b32 s5, s1, s9
-; GFX7-NEXT:    s_lshl_b32 s10, s10, 8
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s12
-; GFX7-NEXT:    s_or_b32 s5, s5, s10
+; GFX7-NEXT:    s_and_b32 s5, s1, 0xff
+; GFX7-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX7-NEXT:    s_or_b32 s5, s5, s9
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s1, s5, s1
 ; GFX7-NEXT:    s_lshl_b32 s5, s6, 24
-; GFX7-NEXT:    s_bfe_u32 s6, s2, s11
+; GFX7-NEXT:    s_bfe_u32 s6, s2, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s7, s2, 24
 ; GFX7-NEXT:    s_or_b32 s1, s1, s5
-; GFX7-NEXT:    s_and_b32 s5, s2, s9
+; GFX7-NEXT:    s_and_b32 s5, s2, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX7-NEXT:    s_bfe_u32 s2, s2, s12
+; GFX7-NEXT:    s_bfe_u32 s2, s2, 0x80010
 ; GFX7-NEXT:    s_or_b32 s5, s5, s6
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX7-NEXT:    s_or_b32 s2, s5, s2
 ; GFX7-NEXT:    s_lshl_b32 s5, s7, 24
-; GFX7-NEXT:    s_bfe_u32 s6, s3, s11
+; GFX7-NEXT:    s_bfe_u32 s6, s3, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s8, s3, 24
 ; GFX7-NEXT:    s_or_b32 s2, s2, s5
-; GFX7-NEXT:    s_and_b32 s5, s3, s9
+; GFX7-NEXT:    s_and_b32 s5, s3, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX7-NEXT:    s_bfe_u32 s3, s3, s12
+; GFX7-NEXT:    s_bfe_u32 s3, s3, 0x80010
 ; GFX7-NEXT:    s_or_b32 s5, s5, s6
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX7-NEXT:    s_or_b32 s3, s5, s3
@@ -4875,60 +4809,60 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s4, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
-; GFX7-NEXT:    s_lshl_b32 s4, s9, s4
+; GFX7-NEXT:    s_lshl_b32 s4, 0xff, s4
 ; GFX7-NEXT:    s_andn2_b32 s4, s6, s4
-; GFX7-NEXT:    v_or_b32_e32 v4, s4, v0
+; GFX7-NEXT:    v_or_b32_e32 v5, s4, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
-; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v8, s9, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v9, v0, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s9, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX7-NEXT:    v_bfe_u32 v9, v1, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, v1, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v8
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
-; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s9, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
+; GFX7-NEXT:    v_bfe_u32 v6, v2, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, v2, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
+; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s9, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -4939,81 +4873,78 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX10-LABEL: insertelement_s_v16i8_v_s:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s6, 0x80008
-; GFX10-NEXT:    s_movk_i32 s5, 0xff
-; GFX10-NEXT:    s_mov_b32 s7, 0x80010
-; GFX10-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v12, 16
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s13, s0, s6
-; GFX10-NEXT:    s_bfe_u32 s15, s1, s6
-; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
-; GFX10-NEXT:    s_lshr_b32 s9, s1, 24
-; GFX10-NEXT:    s_and_b32 s12, s0, s5
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s7
-; GFX10-NEXT:    s_and_b32 s14, s1, s5
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s7
-; GFX10-NEXT:    s_lshl_b32 s13, s13, 8
-; GFX10-NEXT:    s_lshl_b32 s15, s15, 8
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX10-NEXT:    s_bfe_u32 s10, s0, 0x80008
+; GFX10-NEXT:    s_bfe_u32 s12, s1, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s6, s1, 24
+; GFX10-NEXT:    s_and_b32 s9, s0, 0xff
+; GFX10-NEXT:    s_and_b32 s11, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX10-NEXT:    s_lshl_b32 s12, s12, 8
+; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX10-NEXT:    s_or_b32 s12, s12, s13
-; GFX10-NEXT:    s_or_b32 s13, s14, s15
-; GFX10-NEXT:    s_bfe_u32 s17, s2, s6
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 24
-; GFX10-NEXT:    s_lshl_b32 s9, s9, 24
-; GFX10-NEXT:    s_or_b32 s0, s12, s0
-; GFX10-NEXT:    s_or_b32 s1, s13, s1
-; GFX10-NEXT:    s_bfe_u32 s6, s3, s6
-; GFX10-NEXT:    s_lshr_b32 s10, s2, 24
-; GFX10-NEXT:    s_lshr_b32 s11, s3, 24
-; GFX10-NEXT:    s_and_b32 s16, s2, s5
-; GFX10-NEXT:    s_or_b32 s0, s0, s8
-; GFX10-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX10-NEXT:    s_bfe_u32 s2, s2, s7
-; GFX10-NEXT:    s_or_b32 s1, s1, s9
-; GFX10-NEXT:    s_and_b32 s9, s3, s5
-; GFX10-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX10-NEXT:    s_bfe_u32 s3, s3, s7
-; GFX10-NEXT:    s_or_b32 s8, s16, s8
+; GFX10-NEXT:    s_or_b32 s9, s9, s10
+; GFX10-NEXT:    s_or_b32 s10, s11, s12
+; GFX10-NEXT:    s_bfe_u32 s14, s2, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX10-NEXT:    s_or_b32 s1, s10, s1
+; GFX10-NEXT:    s_lshr_b32 s7, s2, 24
+; GFX10-NEXT:    s_and_b32 s13, s2, 0xff
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX10-NEXT:    s_lshl_b32 s14, s14, 8
+; GFX10-NEXT:    s_or_b32 s0, s9, s0
+; GFX10-NEXT:    s_or_b32 s1, s1, s6
+; GFX10-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s8, s3, 24
+; GFX10-NEXT:    s_or_b32 s11, s13, s14
+; GFX10-NEXT:    s_or_b32 s0, s0, s5
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX10-NEXT:    s_or_b32 s6, s9, s6
+; GFX10-NEXT:    s_lshl_b32 s5, s7, 24
+; GFX10-NEXT:    s_and_b32 s7, s3, 0xff
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX10-NEXT:    s_bfe_u32 s3, s3, 0x80010
+; GFX10-NEXT:    s_or_b32 s2, s11, s2
+; GFX10-NEXT:    s_or_b32 s6, s7, s6
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX10-NEXT:    s_or_b32 s2, s8, s2
-; GFX10-NEXT:    s_lshl_b32 s8, s10, 24
+; GFX10-NEXT:    s_or_b32 s2, s2, s5
 ; GFX10-NEXT:    s_or_b32 s3, s6, s3
-; GFX10-NEXT:    s_lshl_b32 s6, s11, 24
-; GFX10-NEXT:    s_lshr_b32 s7, s4, 2
-; GFX10-NEXT:    s_or_b32 s2, s2, s8
-; GFX10-NEXT:    s_or_b32 s3, s3, s6
-; GFX10-NEXT:    s_cmp_eq_u32 s7, 1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s7, 0
-; GFX10-NEXT:    s_cselect_b32 s6, s1, s0
-; GFX10-NEXT:    s_cmp_eq_u32 s7, 2
-; GFX10-NEXT:    s_cselect_b32 s6, s2, s6
-; GFX10-NEXT:    s_cmp_eq_u32 s7, 3
-; GFX10-NEXT:    s_cselect_b32 s6, s3, s6
+; GFX10-NEXT:    s_lshl_b32 s5, s8, 24
+; GFX10-NEXT:    s_lshr_b32 s6, s4, 2
+; GFX10-NEXT:    s_or_b32 s3, s3, s5
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s6, 0
+; GFX10-NEXT:    s_cselect_b32 s5, s1, s0
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX10-NEXT:    s_cselect_b32 s5, s2, s5
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX10-NEXT:    s_cselect_b32 s5, s3, s5
 ; GFX10-NEXT:    s_and_b32 s4, s4, 3
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
-; GFX10-NEXT:    s_lshl_b32 s8, s5, s4
-; GFX10-NEXT:    s_andn2_b32 s6, s6, s8
-; GFX10-NEXT:    v_lshl_or_b32 v4, v0, s4, s6
+; GFX10-NEXT:    s_lshl_b32 s7, 0xff, s4
+; GFX10-NEXT:    s_andn2_b32 s5, s5, s7
+; GFX10-NEXT:    v_lshl_or_b32 v4, v0, s4, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s7, 1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s6, 1
 ; GFX10-NEXT:    s_mov_b32 s1, 16
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s7, 2
-; GFX10-NEXT:    v_and_or_b32 v6, v0, s5, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s6, 2
+; GFX10-NEXT:    v_and_or_b32 v6, 0xff, v0, v6
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s7, 3
+; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s6, 3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
@@ -5021,15 +4952,15 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v9, v1, s5, v9
+; GFX10-NEXT:    v_and_or_b32 v9, 0xff, v1, v9
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v11, v2, s5, v11
+; GFX10-NEXT:    v_and_or_b32 v11, 0xff, v2, v11
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_and_or_b32 v10, v3, s5, v10
+; GFX10-NEXT:    v_and_or_b32 v10, 0xff, v3, v10
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX10-NEXT:    v_or3_b32 v0, v6, v0, v4
@@ -5050,60 +4981,58 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-LABEL: insertelement_s_v16i8_s_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX9-NEXT:    s_mov_b32 s13, 0x80008
-; GFX9-NEXT:    s_movk_i32 s12, 0xff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_movk_i32 s12, 0xff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_u32 s14, s0, s13
-; GFX9-NEXT:    s_and_b32 s8, s0, s12
-; GFX9-NEXT:    s_lshl_b32 s14, s14, 8
-; GFX9-NEXT:    s_or_b32 s8, s8, s14
-; GFX9-NEXT:    s_mov_b32 s14, 0x80010
+; GFX9-NEXT:    s_bfe_u32 s13, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX9-NEXT:    s_bfe_u32 s0, s0, s14
+; GFX9-NEXT:    s_and_b32 s8, s0, 0xff
+; GFX9-NEXT:    s_lshl_b32 s13, s13, 8
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT:    s_or_b32 s8, s8, s13
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s8, s0
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 24
 ; GFX9-NEXT:    s_or_b32 s8, s0, s5
-; GFX9-NEXT:    s_bfe_u32 s5, s1, s13
+; GFX9-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
-; GFX9-NEXT:    s_and_b32 s0, s1, s12
+; GFX9-NEXT:    s_and_b32 s0, s1, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s14
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX9-NEXT:    s_or_b32 s0, s0, s5
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_lshl_b32 s1, s9, 24
 ; GFX9-NEXT:    s_or_b32 s9, s0, s1
-; GFX9-NEXT:    s_bfe_u32 s1, s2, s13
-; GFX9-NEXT:    s_and_b32 s0, s2, s12
+; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x80008
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_bfe_u32 s1, s2, s14
+; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x80010
 ; GFX9-NEXT:    s_lshr_b32 s10, s2, 24
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_lshl_b32 s1, s10, 24
 ; GFX9-NEXT:    s_or_b32 s10, s0, s1
-; GFX9-NEXT:    s_bfe_u32 s1, s3, s13
-; GFX9-NEXT:    s_and_b32 s0, s3, s12
+; GFX9-NEXT:    s_bfe_u32 s1, s3, 0x80008
+; GFX9-NEXT:    s_and_b32 s0, s3, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_bfe_u32 s1, s3, s14
+; GFX9-NEXT:    s_bfe_u32 s1, s3, 0x80010
 ; GFX9-NEXT:    s_lshr_b32 s11, s3, 24
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_lshl_b32 s1, s11, 24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    s_or_b32 s11, s0, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT:    s_and_b32 s4, s4, s12
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
@@ -5111,43 +5040,44 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX9-NEXT:    v_and_or_b32 v5, v1, v0, v2
+; GFX9-NEXT:    v_and_or_b32 v6, v1, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; GFX9-NEXT:    s_mov_b32 s6, 8
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s7, 16
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0xff
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    v_and_or_b32 v8, v0, s12, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT:    v_and_or_b32 v9, v0, v5, v9
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-NEXT:    v_or3_b32 v0, v8, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX9-NEXT:    v_or3_b32 v0, v9, v0, v4
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v4, v1, s12, v4
+; GFX9-NEXT:    v_and_or_b32 v4, v1, v5, v4
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v5
-; GFX9-NEXT:    v_mov_b32_e32 v4, 8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_mov_b32_e32 v8, 16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
-; GFX9-NEXT:    v_and_or_b32 v5, v2, s12, v5
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_or3_b32 v2, v5, v2, v6
-; GFX9-NEXT:    v_and_or_b32 v6, v3, s12, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v6
+; GFX9-NEXT:    v_mov_b32_e32 v4, 8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v9, 16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX9-NEXT:    v_and_or_b32 v6, v2, v5, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_or3_b32 v2, v6, v2, v7
+; GFX9-NEXT:    v_and_or_b32 v6, v3, v5, v4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v8
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_or3_b32 v3, v6, v3, v7
@@ -5157,60 +5087,58 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-LABEL: insertelement_s_v16i8_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX8-NEXT:    s_mov_b32 s13, 0x80008
-; GFX8-NEXT:    s_movk_i32 s12, 0xff
-; GFX8-NEXT:    s_mov_b32 s14, 0x80010
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT:    s_movk_i32 s12, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s9, s0, s13
+; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX8-NEXT:    s_and_b32 s8, s0, s12
+; GFX8-NEXT:    s_and_b32 s8, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX8-NEXT:    s_bfe_u32 s0, s0, s14
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX8-NEXT:    s_or_b32 s8, s8, s9
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s8, s0
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 24
 ; GFX8-NEXT:    s_or_b32 s8, s0, s5
-; GFX8-NEXT:    s_bfe_u32 s5, s1, s13
+; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 24
-; GFX8-NEXT:    s_and_b32 s0, s1, s12
+; GFX8-NEXT:    s_and_b32 s0, s1, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s14
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX8-NEXT:    s_or_b32 s0, s0, s5
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s6, 24
 ; GFX8-NEXT:    s_or_b32 s9, s0, s1
-; GFX8-NEXT:    s_bfe_u32 s1, s2, s13
-; GFX8-NEXT:    s_and_b32 s0, s2, s12
+; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x80008
+; GFX8-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_bfe_u32 s1, s2, s14
+; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x80010
 ; GFX8-NEXT:    s_lshr_b32 s7, s2, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s7, 24
 ; GFX8-NEXT:    s_or_b32 s10, s0, s1
-; GFX8-NEXT:    s_bfe_u32 s1, s3, s13
-; GFX8-NEXT:    s_and_b32 s0, s3, s12
+; GFX8-NEXT:    s_bfe_u32 s1, s3, 0x80008
+; GFX8-NEXT:    s_and_b32 s0, s3, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_bfe_u32 s1, s3, s14
+; GFX8-NEXT:    s_bfe_u32 s1, s3, 0x80010
 ; GFX8-NEXT:    s_lshr_b32 s11, s3, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s11, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s9
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX8-NEXT:    s_or_b32 s11, s0, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT:    s_and_b32 s4, s4, s12
+; GFX8-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
@@ -5269,117 +5197,115 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-LABEL: insertelement_s_v16i8_s_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX7-NEXT:    s_mov_b32 s13, 0x80008
-; GFX7-NEXT:    s_movk_i32 s12, 0xff
-; GFX7-NEXT:    s_mov_b32 s14, 0x80010
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s9, s0, s13
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX7-NEXT:    s_and_b32 s8, s0, s12
+; GFX7-NEXT:    s_and_b32 s8, s0, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX7-NEXT:    s_bfe_u32 s0, s0, s14
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX7-NEXT:    s_or_b32 s8, s8, s9
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s8, s0
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
 ; GFX7-NEXT:    s_or_b32 s8, s0, s5
-; GFX7-NEXT:    s_bfe_u32 s5, s1, s13
+; GFX7-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s6, s1, 24
-; GFX7-NEXT:    s_and_b32 s0, s1, s12
+; GFX7-NEXT:    s_and_b32 s0, s1, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s14
+; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX7-NEXT:    s_or_b32 s0, s0, s5
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_lshl_b32 s1, s6, 24
 ; GFX7-NEXT:    s_or_b32 s9, s0, s1
-; GFX7-NEXT:    s_bfe_u32 s1, s2, s13
-; GFX7-NEXT:    s_and_b32 s0, s2, s12
+; GFX7-NEXT:    s_bfe_u32 s1, s2, 0x80008
+; GFX7-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_bfe_u32 s1, s2, s14
+; GFX7-NEXT:    s_bfe_u32 s1, s2, 0x80010
 ; GFX7-NEXT:    s_lshr_b32 s7, s2, 24
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_lshl_b32 s1, s7, 24
 ; GFX7-NEXT:    s_or_b32 s10, s0, s1
-; GFX7-NEXT:    s_bfe_u32 s1, s3, s13
-; GFX7-NEXT:    s_and_b32 s0, s3, s12
+; GFX7-NEXT:    s_bfe_u32 s1, s3, 0x80008
+; GFX7-NEXT:    s_and_b32 s0, s3, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_bfe_u32 s1, s3, s14
+; GFX7-NEXT:    s_bfe_u32 s1, s3, 0x80010
 ; GFX7-NEXT:    s_lshr_b32 s11, s3, 24
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_lshl_b32 s1, s11, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s9
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX7-NEXT:    s_or_b32 s11, s0, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT:    s_and_b32 s4, s4, s12
+; GFX7-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v0, s12, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_or_b32_e32 v5, v0, v2
+; GFX7-NEXT:    v_or_b32_e32 v6, v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
-; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0xff
+; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v8, s12, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_and_b32_e32 v9, v0, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s11
-; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX7-NEXT:    v_bfe_u32 v9, v1, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s12, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_and_b32_e32 v4, v1, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v8
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
-; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
+; GFX7-NEXT:    v_bfe_u32 v6, v2, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s12, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, v2, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
-; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s12, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v5
+; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -5390,54 +5316,51 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-LABEL: insertelement_s_v16i8_s_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s6, 0x80008
-; GFX10-NEXT:    s_movk_i32 s5, 0xff
-; GFX10-NEXT:    s_mov_b32 s7, 0x80010
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v12, 16
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s5
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xff
 ; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s13, s0, s6
-; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
-; GFX10-NEXT:    s_and_b32 s12, s0, s5
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s7
-; GFX10-NEXT:    s_lshl_b32 s13, s13, 8
-; GFX10-NEXT:    s_bfe_u32 s15, s1, s6
+; GFX10-NEXT:    s_bfe_u32 s9, s0, 0x80008
+; GFX10-NEXT:    s_bfe_u32 s12, s1, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX10-NEXT:    s_and_b32 s8, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX10-NEXT:    s_lshr_b32 s6, s1, 24
+; GFX10-NEXT:    s_and_b32 s10, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s12, s12, 8
+; GFX10-NEXT:    s_bfe_u32 s14, s2, 0x80008
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_or_b32 s12, s12, s13
-; GFX10-NEXT:    s_lshr_b32 s9, s1, 24
-; GFX10-NEXT:    s_and_b32 s14, s1, s5
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s7
-; GFX10-NEXT:    s_lshl_b32 s15, s15, 8
-; GFX10-NEXT:    s_bfe_u32 s17, s2, s6
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 24
-; GFX10-NEXT:    s_or_b32 s0, s12, s0
+; GFX10-NEXT:    s_or_b32 s8, s8, s9
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX10-NEXT:    s_or_b32 s13, s14, s15
-; GFX10-NEXT:    s_lshr_b32 s10, s2, 24
-; GFX10-NEXT:    s_and_b32 s16, s2, s5
-; GFX10-NEXT:    s_or_b32 s8, s0, s8
-; GFX10-NEXT:    s_lshl_b32 s0, s17, 8
-; GFX10-NEXT:    s_bfe_u32 s2, s2, s7
-; GFX10-NEXT:    s_lshl_b32 s9, s9, 24
-; GFX10-NEXT:    s_or_b32 s1, s13, s1
-; GFX10-NEXT:    s_or_b32 s0, s16, s0
-; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX10-NEXT:    s_or_b32 s9, s1, s9
-; GFX10-NEXT:    s_or_b32 s0, s0, s2
-; GFX10-NEXT:    s_lshl_b32 s1, s10, 24
-; GFX10-NEXT:    s_bfe_u32 s2, s3, s6
+; GFX10-NEXT:    s_or_b32 s9, s10, s12
+; GFX10-NEXT:    s_lshr_b32 s7, s2, 24
+; GFX10-NEXT:    s_and_b32 s13, s2, 0xff
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX10-NEXT:    s_lshl_b32 s14, s14, 8
+; GFX10-NEXT:    s_or_b32 s0, s8, s0
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX10-NEXT:    s_or_b32 s1, s9, s1
+; GFX10-NEXT:    s_or_b32 s10, s13, s14
+; GFX10-NEXT:    s_or_b32 s8, s0, s5
+; GFX10-NEXT:    s_lshl_b32 s0, s2, 16
+; GFX10-NEXT:    s_or_b32 s9, s1, s6
+; GFX10-NEXT:    s_or_b32 s0, s10, s0
+; GFX10-NEXT:    s_lshl_b32 s1, s7, 24
+; GFX10-NEXT:    s_bfe_u32 s2, s3, 0x80008
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    s_and_b32 s6, s3, s5
+; GFX10-NEXT:    s_and_b32 s5, s3, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
 ; GFX10-NEXT:    s_or_b32 s10, s0, s1
-; GFX10-NEXT:    s_bfe_u32 s1, s3, s7
-; GFX10-NEXT:    s_or_b32 s0, s6, s2
+; GFX10-NEXT:    s_bfe_u32 s1, s3, 0x80010
+; GFX10-NEXT:    s_or_b32 s0, s5, s2
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, s8, v1, vcc_lo
 ; GFX10-NEXT:    s_or_b32 s1, s0, s1
@@ -5448,7 +5371,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s10, s0
 ; GFX10-NEXT:    s_or_b32 s11, s1, s2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v4
-; GFX10-NEXT:    s_and_b32 s2, s4, s5
+; GFX10-NEXT:    s_and_b32 s2, s4, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s11, s1
@@ -5470,16 +5393,16 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_or_b32 v6, v0, s5, v6
+; GFX10-NEXT:    v_and_or_b32 v6, 0xff, v0, v6
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_and_or_b32 v9, v1, s5, v9
+; GFX10-NEXT:    v_and_or_b32 v9, 0xff, v1, v9
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX10-NEXT:    v_and_or_b32 v11, v2, s5, v11
+; GFX10-NEXT:    v_and_or_b32 v11, 0xff, v2, v11
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_and_or_b32 v10, v3, s5, v10
+; GFX10-NEXT:    v_and_or_b32 v10, 0xff, v3, v10
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX10-NEXT:    v_or3_b32 v0, v6, v0, v4
@@ -5500,54 +5423,52 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-LABEL: insertelement_s_v16i8_v_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX9-NEXT:    s_mov_b32 s12, 0x80008
-; GFX9-NEXT:    s_movk_i32 s10, 0xff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX9-NEXT:    s_movk_i32 s10, 0xff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_u32 s13, s0, s12
-; GFX9-NEXT:    s_and_b32 s11, s0, s10
-; GFX9-NEXT:    s_lshl_b32 s13, s13, 8
-; GFX9-NEXT:    s_or_b32 s11, s11, s13
-; GFX9-NEXT:    s_mov_b32 s13, 0x80010
+; GFX9-NEXT:    s_bfe_u32 s12, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s4, s0, 24
-; GFX9-NEXT:    s_bfe_u32 s0, s0, s13
+; GFX9-NEXT:    s_and_b32 s11, s0, 0xff
+; GFX9-NEXT:    s_lshl_b32 s12, s12, 8
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT:    s_or_b32 s11, s11, s12
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s11, s0
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
-; GFX9-NEXT:    s_bfe_u32 s11, s1, s12
+; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 24
 ; GFX9-NEXT:    s_or_b32 s4, s0, s4
-; GFX9-NEXT:    s_and_b32 s0, s1, s10
+; GFX9-NEXT:    s_and_b32 s0, s1, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s11, s11, 8
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s13
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX9-NEXT:    s_or_b32 s0, s0, s11
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_lshl_b32 s1, s5, 24
 ; GFX9-NEXT:    s_or_b32 s5, s0, s1
-; GFX9-NEXT:    s_bfe_u32 s1, s2, s12
-; GFX9-NEXT:    s_and_b32 s0, s2, s10
+; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x80008
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_bfe_u32 s1, s2, s13
+; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x80010
 ; GFX9-NEXT:    s_lshr_b32 s6, s2, 24
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_lshl_b32 s1, s6, 24
 ; GFX9-NEXT:    s_or_b32 s6, s0, s1
-; GFX9-NEXT:    s_bfe_u32 s1, s3, s12
-; GFX9-NEXT:    s_and_b32 s0, s3, s10
+; GFX9-NEXT:    s_bfe_u32 s1, s3, 0x80008
+; GFX9-NEXT:    s_and_b32 s0, s3, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_bfe_u32 s1, s3, s13
+; GFX9-NEXT:    s_bfe_u32 s1, s3, 0x80010
 ; GFX9-NEXT:    s_lshr_b32 s7, s3, 24
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_lshl_b32 s1, s7, 24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GFX9-NEXT:    s_or_b32 s7, s0, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
@@ -5560,43 +5481,44 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
-; GFX9-NEXT:    v_and_or_b32 v5, v2, v1, v0
+; GFX9-NEXT:    v_and_or_b32 v6, v2, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; GFX9-NEXT:    s_mov_b32 s8, 8
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s9, 16
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0xff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    v_and_or_b32 v8, v0, s10, v8
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT:    v_and_or_b32 v9, v0, v5, v9
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-NEXT:    v_or3_b32 v0, v8, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX9-NEXT:    v_or3_b32 v0, v9, v0, v4
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v4, v1, s10, v4
+; GFX9-NEXT:    v_and_or_b32 v4, v1, v5, v4
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v5
-; GFX9-NEXT:    v_mov_b32_e32 v4, 8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_mov_b32_e32 v8, 16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
-; GFX9-NEXT:    v_and_or_b32 v5, v2, s10, v5
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_or3_b32 v2, v5, v2, v6
-; GFX9-NEXT:    v_and_or_b32 v6, v3, s10, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v6
+; GFX9-NEXT:    v_mov_b32_e32 v4, 8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v9, 16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX9-NEXT:    v_and_or_b32 v6, v2, v5, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_or3_b32 v2, v6, v2, v7
+; GFX9-NEXT:    v_and_or_b32 v6, v3, v5, v4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v8
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_or3_b32 v3, v6, v3, v7
@@ -5606,54 +5528,52 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-LABEL: insertelement_s_v16i8_v_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX8-NEXT:    s_mov_b32 s10, 0x80008
-; GFX8-NEXT:    s_movk_i32 s8, 0xff
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX8-NEXT:    s_movk_i32 s8, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s11, s0, s10
-; GFX8-NEXT:    s_and_b32 s9, s0, s8
-; GFX8-NEXT:    s_lshl_b32 s11, s11, 8
-; GFX8-NEXT:    s_or_b32 s9, s9, s11
-; GFX8-NEXT:    s_mov_b32 s11, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s10, s0, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
-; GFX8-NEXT:    s_bfe_u32 s0, s0, s11
+; GFX8-NEXT:    s_and_b32 s9, s0, 0xff
+; GFX8-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX8-NEXT:    s_or_b32 s9, s9, s10
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s9, s0
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 24
-; GFX8-NEXT:    s_bfe_u32 s9, s1, s10
+; GFX8-NEXT:    s_bfe_u32 s9, s1, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 24
 ; GFX8-NEXT:    s_or_b32 s4, s0, s4
-; GFX8-NEXT:    s_and_b32 s0, s1, s8
+; GFX8-NEXT:    s_and_b32 s0, s1, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s11
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX8-NEXT:    s_or_b32 s0, s0, s9
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s5, 24
 ; GFX8-NEXT:    s_or_b32 s5, s0, s1
-; GFX8-NEXT:    s_bfe_u32 s1, s2, s10
-; GFX8-NEXT:    s_and_b32 s0, s2, s8
+; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x80008
+; GFX8-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_bfe_u32 s1, s2, s11
+; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x80010
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s6, 24
 ; GFX8-NEXT:    s_or_b32 s6, s0, s1
-; GFX8-NEXT:    s_bfe_u32 s1, s3, s10
-; GFX8-NEXT:    s_and_b32 s0, s3, s8
+; GFX8-NEXT:    s_bfe_u32 s1, s3, 0x80008
+; GFX8-NEXT:    s_and_b32 s0, s3, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_bfe_u32 s1, s3, s11
+; GFX8-NEXT:    s_bfe_u32 s1, s3, 0x80010
 ; GFX8-NEXT:    s_lshr_b32 s7, s3, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s7, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GFX8-NEXT:    s_or_b32 s7, s0, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
@@ -5717,54 +5637,52 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-LABEL: insertelement_s_v16i8_v_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX7-NEXT:    s_mov_b32 s10, 0x80008
-; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
+; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s11, s0, s10
-; GFX7-NEXT:    s_and_b32 s9, s0, s8
-; GFX7-NEXT:    s_lshl_b32 s11, s11, 8
-; GFX7-NEXT:    s_or_b32 s9, s9, s11
-; GFX7-NEXT:    s_mov_b32 s11, 0x80010
+; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s4, s0, 24
-; GFX7-NEXT:    s_bfe_u32 s0, s0, s11
+; GFX7-NEXT:    s_and_b32 s9, s0, 0xff
+; GFX7-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX7-NEXT:    s_or_b32 s9, s9, s10
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s9, s0
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 24
-; GFX7-NEXT:    s_bfe_u32 s9, s1, s10
+; GFX7-NEXT:    s_bfe_u32 s9, s1, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s5, s1, 24
 ; GFX7-NEXT:    s_or_b32 s4, s0, s4
-; GFX7-NEXT:    s_and_b32 s0, s1, s8
+; GFX7-NEXT:    s_and_b32 s0, s1, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s11
+; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX7-NEXT:    s_or_b32 s0, s0, s9
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_lshl_b32 s1, s5, 24
 ; GFX7-NEXT:    s_or_b32 s5, s0, s1
-; GFX7-NEXT:    s_bfe_u32 s1, s2, s10
-; GFX7-NEXT:    s_and_b32 s0, s2, s8
+; GFX7-NEXT:    s_bfe_u32 s1, s2, 0x80008
+; GFX7-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_bfe_u32 s1, s2, s11
+; GFX7-NEXT:    s_bfe_u32 s1, s2, 0x80010
 ; GFX7-NEXT:    s_lshr_b32 s6, s2, 24
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_lshl_b32 s1, s6, 24
 ; GFX7-NEXT:    s_or_b32 s6, s0, s1
-; GFX7-NEXT:    s_bfe_u32 s1, s3, s10
-; GFX7-NEXT:    s_and_b32 s0, s3, s8
+; GFX7-NEXT:    s_bfe_u32 s1, s3, 0x80008
+; GFX7-NEXT:    s_and_b32 s0, s3, 0xff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_bfe_u32 s1, s3, s11
+; GFX7-NEXT:    s_bfe_u32 s1, s3, 0x80010
 ; GFX7-NEXT:    s_lshr_b32 s7, s3, 24
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_lshl_b32 s1, s7, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
-; GFX7-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GFX7-NEXT:    s_or_b32 s7, s0, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
@@ -5779,55 +5697,56 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
-; GFX7-NEXT:    v_or_b32_e32 v5, v1, v0
+; GFX7-NEXT:    v_or_b32_e32 v6, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
-; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0xff
+; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v8, s8, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_and_b32_e32 v9, v0, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX7-NEXT:    v_bfe_u32 v9, v1, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s8, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_and_b32_e32 v4, v1, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v8
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
-; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
+; GFX7-NEXT:    v_bfe_u32 v6, v2, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s8, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, v2, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
-; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s8, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v5
+; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -5838,65 +5757,62 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-LABEL: insertelement_s_v16i8_v_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s7, 0x80008
-; GFX10-NEXT:    s_movk_i32 s8, 0xff
-; GFX10-NEXT:    s_mov_b32 s9, 0x80010
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v12, 16
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s8
+; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_u32 s12, s0, s7
+; GFX10-NEXT:    s_bfe_u32 s9, s0, 0x80008
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
-; GFX10-NEXT:    s_and_b32 s11, s0, s8
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s9
-; GFX10-NEXT:    s_lshl_b32 s12, s12, 8
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_or_b32 s11, s11, s12
-; GFX10-NEXT:    s_bfe_u32 s16, s2, s7
-; GFX10-NEXT:    s_lshl_b32 s4, s4, 24
-; GFX10-NEXT:    s_or_b32 s0, s11, s0
-; GFX10-NEXT:    s_bfe_u32 s14, s1, s7
-; GFX10-NEXT:    s_and_b32 s15, s2, s8
-; GFX10-NEXT:    s_lshl_b32 s16, s16, 8
-; GFX10-NEXT:    s_or_b32 s4, s0, s4
-; GFX10-NEXT:    s_bfe_u32 s0, s2, s9
+; GFX10-NEXT:    s_and_b32 s8, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX10-NEXT:    s_bfe_u32 s11, s1, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s9, s9, 8
 ; GFX10-NEXT:    s_lshr_b32 s5, s1, 24
-; GFX10-NEXT:    s_and_b32 s13, s1, s8
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s9
-; GFX10-NEXT:    s_lshl_b32 s14, s14, 8
-; GFX10-NEXT:    s_lshr_b32 s6, s2, 24
-; GFX10-NEXT:    s_or_b32 s2, s15, s16
+; GFX10-NEXT:    s_and_b32 s10, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX10-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX10-NEXT:    s_or_b32 s8, s8, s9
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 24
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX10-NEXT:    s_or_b32 s12, s13, s14
-; GFX10-NEXT:    s_or_b32 s0, s2, s0
-; GFX10-NEXT:    s_lshl_b32 s2, s6, 24
-; GFX10-NEXT:    s_or_b32 s1, s12, s1
-; GFX10-NEXT:    s_lshl_b32 s5, s5, 24
-; GFX10-NEXT:    s_or_b32 s6, s0, s2
-; GFX10-NEXT:    s_bfe_u32 s0, s3, s7
-; GFX10-NEXT:    s_or_b32 s5, s1, s5
-; GFX10-NEXT:    s_and_b32 s1, s3, s8
+; GFX10-NEXT:    s_or_b32 s9, s10, s11
+; GFX10-NEXT:    s_or_b32 s0, s8, s0
+; GFX10-NEXT:    s_or_b32 s1, s9, s1
+; GFX10-NEXT:    s_or_b32 s4, s0, s4
+; GFX10-NEXT:    s_lshl_b32 s0, s5, 24
+; GFX10-NEXT:    s_bfe_u32 s13, s2, 0x80008
+; GFX10-NEXT:    s_or_b32 s5, s1, s0
+; GFX10-NEXT:    s_bfe_u32 s0, s3, 0x80008
+; GFX10-NEXT:    s_and_b32 s1, s3, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-NEXT:    s_or_b32 s0, s1, s0
-; GFX10-NEXT:    s_bfe_u32 s1, s3, s9
-; GFX10-NEXT:    s_lshr_b32 s10, s3, 24
+; GFX10-NEXT:    s_bfe_u32 s1, s3, 0x80010
+; GFX10-NEXT:    s_lshr_b32 s6, s2, 24
+; GFX10-NEXT:    s_and_b32 s12, s2, 0xff
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s13, s13, 8
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX10-NEXT:    s_or_b32 s10, s12, s13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, s4, v2, vcc_lo
 ; GFX10-NEXT:    s_or_b32 s1, s0, s1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v4
-; GFX10-NEXT:    s_lshl_b32 s2, s10, 24
+; GFX10-NEXT:    s_lshr_b32 s7, s3, 24
+; GFX10-NEXT:    s_or_b32 s2, s10, s2
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 24
 ; GFX10-NEXT:    s_mov_b32 s3, 8
+; GFX10-NEXT:    s_or_b32 s6, s2, s6
+; GFX10-NEXT:    s_lshl_b32 s2, s7, 24
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s6, s0
 ; GFX10-NEXT:    s_or_b32 s7, s1, s2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s6, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s7, s1
 ; GFX10-NEXT:    v_and_or_b32 v5, v2, v1, v0
@@ -5917,16 +5833,16 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_or_b32 v6, v0, s8, v6
+; GFX10-NEXT:    v_and_or_b32 v6, 0xff, v0, v6
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_and_or_b32 v9, v1, s8, v9
+; GFX10-NEXT:    v_and_or_b32 v9, 0xff, v1, v9
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX10-NEXT:    v_and_or_b32 v11, v2, s8, v11
+; GFX10-NEXT:    v_and_or_b32 v11, 0xff, v2, v11
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_and_or_b32 v10, v3, s8, v10
+; GFX10-NEXT:    v_and_or_b32 v10, 0xff, v3, v10
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX10-NEXT:    v_or3_b32 v0, v6, v0, v4
@@ -5948,171 +5864,172 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    s_movk_i32 s6, 0xff
-; GFX9-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-NEXT:    v_mov_b32_e32 v1, 16
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    s_movk_i32 s3, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v7, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v5
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v13
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v15
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 2, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s3, v12
+; GFX9-NEXT:    v_and_or_b32 v12, v4, s3, v14
+; GFX9-NEXT:    v_and_or_b32 v14, v5, v0, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 2, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v6
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v17
-; GFX9-NEXT:    s_and_b32 s0, s2, s6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX9-NEXT:    v_or3_b32 v3, v3, v13, v8
+; GFX9-NEXT:    v_or3_b32 v8, v12, v15, v9
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v19, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v6, v6, v0, v18
+; GFX9-NEXT:    v_lshlrev_b32_e64 v18, v2, s0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX9-NEXT:    v_or3_b32 v3, v3, v14, v9
-; GFX9-NEXT:    v_or3_b32 v4, v4, v16, v10
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX9-NEXT:    v_and_or_b32 v13, v6, s6, v19
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v17, v2, s0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX9-NEXT:    v_or3_b32 v5, v5, v18, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v15
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s6
-; GFX9-NEXT:    v_or3_b32 v6, v13, v6, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v5, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v15
+; GFX9-NEXT:    v_or3_b32 v9, v14, v17, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v3, v8, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, v2, v0
+; GFX9-NEXT:    v_or3_b32 v6, v6, v19, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v9, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v16
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[2:3]
-; GFX9-NEXT:    v_and_or_b32 v2, v9, v2, v17
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v6, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v2, v10, v2, v18
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v1, v3, s6, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v9
-; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v10
-; GFX9-NEXT:    v_and_or_b32 v10, v2, s6, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v3, v3, v0, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX9-NEXT:    v_and_or_b32 v8, v8, v0, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX9-NEXT:    v_and_or_b32 v9, v9, v0, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX9-NEXT:    v_or3_b32 v0, v1, v13, v3
-; GFX9-NEXT:    v_or3_b32 v1, v4, v15, v6
-; GFX9-NEXT:    v_or3_b32 v2, v5, v17, v9
-; GFX9-NEXT:    v_or3_b32 v3, v10, v18, v11
-; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
+; GFX9-NEXT:    v_and_or_b32 v13, v2, v0, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_or3_b32 v0, v3, v14, v6
+; GFX9-NEXT:    v_or3_b32 v1, v8, v16, v10
+; GFX9-NEXT:    v_or3_b32 v2, v9, v18, v11
+; GFX9-NEXT:    v_or3_b32 v3, v13, v7, v12
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v16i8_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v0, 8
-; GFX8-NEXT:    v_mov_b32_e32 v1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v9, 8
-; GFX8-NEXT:    v_mov_b32_e32 v10, 16
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    s_and_b32 s1, s2, s0
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0
-; GFX8-NEXT:    v_mov_b32_e32 v8, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v8, 8
+; GFX8-NEXT:    v_mov_b32_e32 v7, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v9, 16
+; GFX8-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v4
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v5
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 2, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v14, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 2, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v16
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 24, v6
-; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
-; GFX8-NEXT:    v_or_b32_e32 v1, v4, v18
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v11
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX8-NEXT:    v_lshlrev_b32_e64 v17, v2, s1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
-; GFX8-NEXT:    v_or_b32_e32 v4, v5, v6
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v15
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v14
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v15
-; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v2, v5, v2
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v17
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[2:3]
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v6
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e64 v18, v2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v15
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
+; GFX8-NEXT:    v_or_b32_e32 v7, v14, v17
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v19
+; GFX8-NEXT:    v_or_b32_e32 v3, v7, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v2, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v16
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v12
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[0:1]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v16
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v0, v7, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v18
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, v0, s[2:3]
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v11
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v13
-; GFX8-NEXT:    v_or_b32_e32 v11, v0, v15
-; GFX8-NEXT:    v_or_b32_e32 v12, v1, v17
-; GFX8-NEXT:    v_or_b32_e32 v10, v2, v10
-; GFX8-NEXT:    v_or_b32_e32 v0, v3, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v11, v5
-; GFX8-NEXT:    v_or_b32_e32 v2, v12, v6
-; GFX8-NEXT:    v_or_b32_e32 v3, v10, v9
-; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v11
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v13
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v17
+; GFX8-NEXT:    v_or_b32_e32 v9, v0, v9
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v6
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v10
+; GFX8-NEXT:    v_or_b32_e32 v3, v9, v8
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v16i8_s_v:
@@ -6121,108 +6038,109 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s6, 0xff
-; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 2, v2
+; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    v_mov_b32_e32 v7, 0xff
+; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 2, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX7-NEXT:    s_and_b32 s0, s2, s6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
-; GFX7-NEXT:    v_lshl_b32_e32 v18, s0, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v17
-; GFX7-NEXT:    v_lshl_b32_e32 v2, s6, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v17
-; GFX7-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v18
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v18
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v10, v3, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v12, v4, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v11, v3, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v13, v4, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v4
-; GFX7-NEXT:    v_and_b32_e32 v9, s6, v3
+; GFX7-NEXT:    v_and_b32_e32 v10, s0, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v11, s6, v4
+; GFX7-NEXT:    v_and_b32_e32 v12, s0, v4
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v14, v5, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v5
-; GFX7-NEXT:    v_and_b32_e32 v13, s6, v5
+; GFX7-NEXT:    v_bfe_u32 v15, v5, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v5
+; GFX7-NEXT:    v_and_b32_e32 v14, v5, v7
 ; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v16, v6, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v17, v6, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v6
-; GFX7-NEXT:    v_and_b32_e32 v15, s6, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
+; GFX7-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX7-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
+; GFX7-NEXT:    v_and_b32_e32 v16, v6, v7
 ; GFX7-NEXT:    v_bfe_u32 v6, v6, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
-; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
-; GFX7-NEXT:    v_or_b32_e32 v3, v9, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v10, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
+; GFX7-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX7-NEXT:    v_or_b32_e32 v4, v11, v4
+; GFX7-NEXT:    s_and_b32 s0, s2, 0xff
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_or_b32_e32 v12, v15, v16
-; GFX7-NEXT:    v_or_b32_e32 v5, v11, v5
+; GFX7-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX7-NEXT:    v_or_b32_e32 v5, v12, v5
 ; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v5, v7
+; GFX7-NEXT:    v_lshl_b32_e32 v19, s0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX7-NEXT:    v_or_b32_e32 v6, v13, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v5, v8
 ; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT:    v_or_b32_e32 v4, v6, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v18
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v2, v7
+; GFX7-NEXT:    v_or_b32_e32 v4, v6, v9
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s[0:1]
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v18
+; GFX7-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[2:3]
 ; GFX7-NEXT:    v_and_b32_e32 v2, v5, v2
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v18
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[0:1]
-; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v11, v1, 8, 8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[2:3]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX7-NEXT:    v_and_b32_e32 v8, s6, v0
+; GFX7-NEXT:    v_and_b32_e32 v9, v0, v7
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v10, s6, v1
+; GFX7-NEXT:    v_bfe_u32 v12, v1, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v11, v1, v7
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v13, v3, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_bfe_u32 v14, v3, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
-; GFX7-NEXT:    v_and_b32_e32 v12, s6, v3
+; GFX7-NEXT:    v_and_b32_e32 v13, v3, v7
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
+; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v10, v12, v13
-; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
+; GFX7-NEXT:    v_or_b32_e32 v1, v10, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v10, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v11, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
 ; GFX7-NEXT:    v_bfe_u32 v5, v4, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, s6, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, v4, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX7-NEXT:    s_endpgm
@@ -6233,7 +6151,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    s_mov_b32 s1, 16
-; GFX10-NEXT:    s_movk_i32 s3, 0xff
 ; GFX10-NEXT:    v_and_b32_e32 v0, 3, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
@@ -6248,25 +6165,25 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_or_b32 v3, v3, s3, v12
+; GFX10-NEXT:    v_and_or_b32 v3, v3, 0xff, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX10-NEXT:    v_and_or_b32 v4, v4, s3, v14
+; GFX10-NEXT:    v_and_or_b32 v4, v4, 0xff, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v6
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_or_b32 v5, v5, s3, v16
+; GFX10-NEXT:    v_and_or_b32 v5, 0xff, v5, v16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX10-NEXT:    v_or3_b32 v3, v3, v13, v8
 ; GFX10-NEXT:    v_or3_b32 v4, v4, v15, v9
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v6, v6, s3, v18
+; GFX10-NEXT:    v_and_or_b32 v6, 0xff, v6, v18
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v11
 ; GFX10-NEXT:    v_or3_b32 v5, v5, v17, v10
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v2
-; GFX10-NEXT:    s_and_b32 s1, s2, s3
-; GFX10-NEXT:    v_lshlrev_b32_e64 v10, v0, s3
+; GFX10-NEXT:    s_and_b32 s1, s2, 0xff
+; GFX10-NEXT:    v_lshlrev_b32_e64 v10, v0, 0xff
 ; GFX10-NEXT:    v_or3_b32 v6, v6, v12, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v9, v5, s0
@@ -6291,13 +6208,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v2, v2, s3, v10
+; GFX10-NEXT:    v_and_or_b32 v2, 0xff, v2, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v5
-; GFX10-NEXT:    v_and_or_b32 v3, v3, s3, v12
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v3, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT:    v_and_or_b32 v12, v4, s3, v14
+; GFX10-NEXT:    v_and_or_b32 v12, 0xff, v4, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX10-NEXT:    v_and_or_b32 v14, v0, s3, v1
+; GFX10-NEXT:    v_and_or_b32 v14, 0xff, v0, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
@@ -6319,95 +6236,95 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    s_movk_i32 s6, 0xff
-; GFX9-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-NEXT:    v_mov_b32_e32 v1, 16
+; GFX9-NEXT:    s_movk_i32 s3, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 8
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v7, 16
 ; GFX9-NEXT:    s_lshr_b32 s4, s2, 2
-; GFX9-NEXT:    s_and_b32 s2, s2, 3
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    s_lshl_b32 s2, s6, s2
-; GFX9-NEXT:    s_not_b32 s5, s2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v5
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v13
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    s_and_b32 s0, s2, 3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s3, v12
+; GFX9-NEXT:    v_and_or_b32 v12, v4, s3, v14
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v14, v5, v0, v16
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    s_lshl_b32 s0, 0xff, s0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v6
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v17
+; GFX9-NEXT:    v_or3_b32 v3, v3, v13, v8
+; GFX9-NEXT:    v_or3_b32 v8, v12, v15, v9
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v19, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v6, v6, v0, v18
+; GFX9-NEXT:    s_not_b32 s5, s0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX9-NEXT:    v_or3_b32 v3, v3, v14, v9
-; GFX9-NEXT:    v_or3_b32 v4, v4, v16, v10
-; GFX9-NEXT:    v_and_or_b32 v13, v6, s6, v19
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX9-NEXT:    v_or3_b32 v5, v5, v18, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc
+; GFX9-NEXT:    v_or3_b32 v9, v14, v17, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v3, v8, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
-; GFX9-NEXT:    v_or3_b32 v6, v13, v6, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v5, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[2:3]
-; GFX9-NEXT:    v_and_or_b32 v2, v9, s5, v2
+; GFX9-NEXT:    v_or3_b32 v6, v6, v19, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v9, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v6, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v2, v10, s5, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v1, v3, s6, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v9
-; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v10
-; GFX9-NEXT:    v_and_or_b32 v10, v2, s6, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v3, v3, v0, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX9-NEXT:    v_and_or_b32 v8, v8, v0, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX9-NEXT:    v_and_or_b32 v9, v9, v0, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX9-NEXT:    v_or3_b32 v0, v1, v13, v3
-; GFX9-NEXT:    v_or3_b32 v1, v4, v15, v6
-; GFX9-NEXT:    v_or3_b32 v2, v5, v17, v9
-; GFX9-NEXT:    v_or3_b32 v3, v10, v18, v11
-; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
+; GFX9-NEXT:    v_and_or_b32 v13, v2, v0, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_or3_b32 v0, v3, v14, v6
+; GFX9-NEXT:    v_or3_b32 v1, v8, v16, v10
+; GFX9-NEXT:    v_or3_b32 v2, v9, v18, v11
+; GFX9-NEXT:    v_or3_b32 v3, v13, v7, v12
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v16i8_v_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT:    s_and_b32 s1, s2, 3
+; GFX8-NEXT:    s_and_b32 s0, s2, 3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 8
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 8
-; GFX8-NEXT:    v_mov_b32_e32 v11, s1
+; GFX8-NEXT:    v_mov_b32_e32 v11, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_lshr_b32 s4, s2, 2
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    s_lshl_b32 s0, 0xff, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX8-NEXT:    s_not_b32 s5, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
@@ -6490,108 +6407,109 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s6, 0xff
-; GFX7-NEXT:    v_and_b32_e32 v0, s6, v2
-; GFX7-NEXT:    s_and_b32 s0, s2, 3
+; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    v_mov_b32_e32 v7, 0xff
 ; GFX7-NEXT:    s_lshr_b32 s4, s2, 2
-; GFX7-NEXT:    s_lshl_b32 s0, s0, 3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
-; GFX7-NEXT:    s_lshl_b32 s0, s6, s0
+; GFX7-NEXT:    v_and_b32_e32 v2, v2, v7
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT:    s_not_b32 s5, s0
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_u32 v10, v3, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v12, v4, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v4
-; GFX7-NEXT:    v_and_b32_e32 v9, s6, v3
+; GFX7-NEXT:    v_bfe_u32 v11, v3, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v13, v4, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v4
+; GFX7-NEXT:    v_and_b32_e32 v10, s0, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v11, s6, v4
+; GFX7-NEXT:    v_and_b32_e32 v12, s0, v4
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v14, v5, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v5
-; GFX7-NEXT:    v_and_b32_e32 v13, s6, v5
+; GFX7-NEXT:    v_bfe_u32 v15, v5, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v5
+; GFX7-NEXT:    v_and_b32_e32 v14, v5, v7
 ; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v16, v6, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v17, v6, 8, 8
+; GFX7-NEXT:    s_and_b32 s0, s2, 3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v6
-; GFX7-NEXT:    v_and_b32_e32 v15, s6, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
+; GFX7-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX7-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
+; GFX7-NEXT:    v_and_b32_e32 v16, v6, v7
 ; GFX7-NEXT:    v_bfe_u32 v6, v6, 16, 8
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
-; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
-; GFX7-NEXT:    v_or_b32_e32 v3, v9, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v10, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_or_b32_e32 v12, v15, v16
-; GFX7-NEXT:    v_or_b32_e32 v5, v11, v5
-; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
+; GFX7-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX7-NEXT:    v_or_b32_e32 v4, v11, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s0, v2
+; GFX7-NEXT:    s_lshl_b32 s0, 0xff, s0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v5, v7
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v1, v2, vcc
-; GFX7-NEXT:    v_or_b32_e32 v4, v6, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX7-NEXT:    v_or_b32_e32 v5, v12, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX7-NEXT:    s_not_b32 s5, s0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX7-NEXT:    v_or_b32_e32 v6, v13, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v5, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX7-NEXT:    v_or_b32_e32 v4, v6, v9
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s[0:1]
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[2:3]
 ; GFX7-NEXT:    v_and_b32_e32 v5, s5, v5
-; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s[0:1]
-; GFX7-NEXT:    v_bfe_u32 v9, v1, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v11, v2, 8, 8
-; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v0, s[2:3]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
-; GFX7-NEXT:    v_and_b32_e32 v8, s6, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 8
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[2:3]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v9, v0, v7
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v12, v1, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v11, v1, v7
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v10, s6, v2
-; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v13, v3, 8, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_bfe_u32 v14, v3, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
-; GFX7-NEXT:    v_and_b32_e32 v12, s6, v3
+; GFX7-NEXT:    v_and_b32_e32 v13, v3, v7
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
+; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v10, v12, v13
-; GFX7-NEXT:    v_or_b32_e32 v1, v8, v1
-; GFX7-NEXT:    v_or_b32_e32 v2, v9, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v2, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v10, v3
+; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
+; GFX7-NEXT:    v_or_b32_e32 v1, v10, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    v_or_b32_e32 v2, v11, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
 ; GFX7-NEXT:    v_bfe_u32 v5, v4, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, s6, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, v4, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX7-NEXT:    s_endpgm
@@ -6602,10 +6520,9 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX10-NEXT:    s_mov_b32 s1, 16
-; GFX10-NEXT:    s_movk_i32 s3, 0xff
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 16
-; GFX10-NEXT:    s_lshr_b32 s4, s2, 2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s4, 1
+; GFX10-NEXT:    s_lshr_b32 s3, s2, 2
+; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s3, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
@@ -6615,34 +6532,34 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_or_b32 v3, v3, s3, v11
+; GFX10-NEXT:    v_and_or_b32 v3, v3, 0xff, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_and_or_b32 v4, v4, s3, v13
+; GFX10-NEXT:    v_and_or_b32 v4, v4, 0xff, v13
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v6
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v17, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_or_b32 v5, v5, s3, v15
+; GFX10-NEXT:    v_and_or_b32 v5, 0xff, v5, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_or3_b32 v3, v3, v12, v7
 ; GFX10-NEXT:    v_or3_b32 v4, v4, v14, v8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v6, v6, s3, v17
+; GFX10-NEXT:    v_and_or_b32 v6, 0xff, v6, v17
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v10
 ; GFX10-NEXT:    v_or3_b32 v5, v5, v16, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v3, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s4, 2
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, 2
 ; GFX10-NEXT:    s_and_b32 s1, s2, 3
 ; GFX10-NEXT:    v_or3_b32 v6, v6, v11, v7
 ; GFX10-NEXT:    s_lshl_b32 s2, s1, 3
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s4, 3
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s3, 3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v8, v5, s0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX10-NEXT:    s_lshl_b32 s2, 0xff, s2
 ; GFX10-NEXT:    s_not_b32 s2, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s1
 ; GFX10-NEXT:    v_and_or_b32 v2, v7, s2, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s4, 0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s3, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s0
@@ -6659,13 +6576,13 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v1, v3, s3, v10
+; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v3, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
-; GFX10-NEXT:    v_and_or_b32 v6, v4, s3, v12
+; GFX10-NEXT:    v_and_or_b32 v6, 0xff, v4, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_and_or_b32 v10, v5, s3, v14
+; GFX10-NEXT:    v_and_or_b32 v10, 0xff, v5, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX10-NEXT:    v_and_or_b32 v12, v2, s3, v0
+; GFX10-NEXT:    v_and_or_b32 v12, 0xff, v2, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
@@ -6969,50 +6886,48 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
 ; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    v_mov_b32_e32 v8, 8
+; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    s_mov_b32 s1, 16
-; GFX10-NEXT:    s_movk_i32 s2, 0xff
 ; GFX10-NEXT:    v_and_b32_e32 v0, 3, v3
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX10-NEXT:    v_mov_b32_e32 v9, 16
+; GFX10-NEXT:    v_mov_b32_e32 v8, 16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v5
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v6
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v17, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_or_b32 v4, v4, s2, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_and_or_b32 v4, v4, 0xff, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX10-NEXT:    v_and_or_b32 v5, v5, 0xff, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX10-NEXT:    v_and_or_b32 v5, v5, s2, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v7
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v19, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_and_or_b32 v6, 0xff, v6, v17
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 24, v7
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v20, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_or_b32 v6, v6, v1, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX10-NEXT:    v_or3_b32 v4, v4, v15, v10
-; GFX10-NEXT:    v_or3_b32 v5, v5, v17, v11
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v7, v7, v1, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v13
-; GFX10-NEXT:    v_or3_b32 v6, v6, v19, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v4, v5, vcc_lo
+; GFX10-NEXT:    v_or3_b32 v4, v4, v14, v9
+; GFX10-NEXT:    v_or3_b32 v5, v5, v16, v10
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v7, 0xff, v7, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v12
+; GFX10-NEXT:    v_or3_b32 v6, v6, v18, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, v0, v1
-; GFX10-NEXT:    v_or3_b32 v7, v7, v14, v10
+; GFX10-NEXT:    v_lshlrev_b32_e64 v11, v0, 0xff
+; GFX10-NEXT:    v_or3_b32 v7, v7, v13, v9
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, v11, v6, s0
-; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v12
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v7, s1
-; GFX10-NEXT:    v_and_or_b32 v0, v10, v2, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v10, v6, s0
+; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v7, s1
+; GFX10-NEXT:    v_and_or_b32 v0, v9, v2, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, v0, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v5, v0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v6, v0, s0
@@ -7020,29 +6935,29 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v1, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v5
-; GFX10-NEXT:    v_and_or_b32 v3, v3, v1, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v2, 0xff, v2, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v5
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v3, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT:    v_and_or_b32 v13, v4, v1, v15
+; GFX10-NEXT:    v_and_or_b32 v12, 0xff, v4, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_and_or_b32 v8, v0, v1, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX10-NEXT:    v_and_or_b32 v14, 0xff, v0, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-NEXT:    v_or3_b32 v0, v2, v12, v11
-; GFX10-NEXT:    v_or3_b32 v1, v3, v14, v6
-; GFX10-NEXT:    v_or3_b32 v2, v13, v16, v7
-; GFX10-NEXT:    v_or3_b32 v3, v8, v9, v10
+; GFX10-NEXT:    v_or3_b32 v0, v2, v11, v10
+; GFX10-NEXT:    v_or3_b32 v1, v3, v13, v6
+; GFX10-NEXT:    v_or3_b32 v2, v12, v15, v7
+; GFX10-NEXT:    v_or3_b32 v3, v14, v8, v9
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index 114bfc0aa5153..ef3d6f6b479e7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -1261,10 +1261,9 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)*
 ; GFX10-LABEL: test_div_scale_f32_undef_val_val:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0x41000000
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, s0
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0
 ; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
index 8803508a7ecdc..5135a791b9705 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
@@ -505,15 +505,14 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16
 ;
 ; GFX10-LABEL: atomic_add_i32_3d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_lshl_b32 s8, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v4, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v3, v4, s8
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v3, s8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7
@@ -549,15 +548,14 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i1
 ;
 ; GFX10-LABEL: atomic_add_i32_cube:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_lshl_b32 s8, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v4, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v3, v4, s8
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v3, s8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7
@@ -632,15 +630,14 @@ define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data,
 ;
 ; GFX10-LABEL: atomic_add_i32_2darray:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_lshl_b32 s8, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v4, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v3, v4, s8
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v3, s8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7
@@ -676,15 +673,14 @@ define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data,
 ;
 ; GFX10-LABEL: atomic_add_i32_2dmsaa:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_lshl_b32 s8, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v4, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v3, v4, s8
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v3, s8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7
@@ -720,14 +716,13 @@ define amdgpu_ps float @atomic_add_i32_2darraymsaa(<8 x i32> inreg %rsrc, i32 %d
 ;
 ; GFX10-LABEL: atomic_add_i32_2darraymsaa:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v5, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v3, v5, v4
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v3, v4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
@@ -1279,15 +1274,14 @@ define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data
 ;
 ; GFX10-LABEL: atomic_add_i64_3d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_lshl_b32 s8, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v5, v3
-; GFX10-NEXT:    v_and_or_b32 v3, v4, v5, s8
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v2, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v4, s8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7
@@ -1323,15 +1317,14 @@ define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %da
 ;
 ; GFX10-LABEL: atomic_add_i64_cube:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_lshl_b32 s8, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v5, v3
-; GFX10-NEXT:    v_and_or_b32 v3, v4, v5, s8
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v2, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v4, s8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7
@@ -1406,15 +1399,14 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64
 ;
 ; GFX10-LABEL: atomic_add_i64_2darray:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_lshl_b32 s8, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v5, v3
-; GFX10-NEXT:    v_and_or_b32 v3, v4, v5, s8
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v2, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v4, s8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7
@@ -1450,15 +1442,14 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 %
 ;
 ; GFX10-LABEL: atomic_add_i64_2dmsaa:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_lshl_b32 s8, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v5, v3
-; GFX10-NEXT:    v_and_or_b32 v3, v4, v5, s8
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v2, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v4, s8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7
@@ -1494,14 +1485,13 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darraymsaa(<8 x i32> inreg %rsrc,
 ;
 ; GFX10-LABEL: atomic_add_i64_2darraymsaa:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v6, v3
-; GFX10-NEXT:    v_and_or_b32 v3, v4, v6, v5
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v2, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v4, v5
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
index 686658f24cad2..1bb1cc6db8f59 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -86,7 +86,6 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10NSA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s4, s6
@@ -100,8 +99,8 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s7, s9
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
-; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v3, v1
-; GFX10NSA-NEXT:    v_and_or_b32 v1, v2, v3, s12
+; GFX10NSA-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10NSA-NEXT:    v_and_or_b32 v1, 0xffff, v2, s12
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
@@ -143,7 +142,6 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10NSA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s4, s6
@@ -157,8 +155,8 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s7, s9
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
-; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v3, v1
-; GFX10NSA-NEXT:    v_and_or_b32 v1, v2, v3, s12
+; GFX10NSA-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10NSA-NEXT:    v_and_or_b32 v1, 0xffff, v2, s12
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
@@ -252,7 +250,6 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10NSA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s4, s6
@@ -266,8 +263,8 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10NSA-NEXT:    s_mov_b32 s7, s9
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
-; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v3, v1
-; GFX10NSA-NEXT:    v_and_or_b32 v1, v2, v3, s12
+; GFX10NSA-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10NSA-NEXT:    v_and_or_b32 v1, 0xffff, v2, s12
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
@@ -309,7 +306,6 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10NSA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s4, s6
@@ -323,8 +319,8 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s7, s9
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
-; GFX10NSA-NEXT:    v_and_or_b32 v1, v1, v4, v2
-; GFX10NSA-NEXT:    v_and_or_b32 v2, v3, v4, s12
+; GFX10NSA-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10NSA-NEXT:    v_and_or_b32 v2, 0xffff, v3, s12
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
@@ -366,7 +362,6 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10NSA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s4, s6
@@ -380,8 +375,8 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s7, s9
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
-; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v3, s12
-; GFX10NSA-NEXT:    v_and_or_b32 v1, v1, v3, v2
+; GFX10NSA-NEXT:    v_and_or_b32 v0, 0xffff, v0, s12
+; GFX10NSA-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
@@ -423,7 +418,6 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10NSA-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s4, s6
@@ -437,8 +431,8 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10NSA-NEXT:    s_mov_b32 s7, s9
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
-; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v4, s12
-; GFX10NSA-NEXT:    v_and_or_b32 v2, v2, v4, v3
+; GFX10NSA-NEXT:    v_and_or_b32 v0, 0xffff, v0, s12
+; GFX10NSA-NEXT:    v_and_or_b32 v2, 0xffff, v2, v3
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
@@ -481,7 +475,6 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10NSA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s4, s6
@@ -495,9 +488,9 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s7, s9
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
-; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v4, s12
-; GFX10NSA-NEXT:    v_and_or_b32 v1, v1, v4, v2
-; GFX10NSA-NEXT:    v_and_or_b32 v2, v3, v4, s12
+; GFX10NSA-NEXT:    v_and_or_b32 v0, 0xffff, v0, s12
+; GFX10NSA-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10NSA-NEXT:    v_and_or_b32 v2, 0xffff, v3, s12
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
@@ -540,7 +533,6 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10NSA-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s4, s6
@@ -554,9 +546,9 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10NSA-NEXT:    s_mov_b32 s7, s9
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
-; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v5, s12
-; GFX10NSA-NEXT:    v_and_or_b32 v2, v2, v5, v3
-; GFX10NSA-NEXT:    v_and_or_b32 v3, v4, v5, s12
+; GFX10NSA-NEXT:    v_and_or_b32 v0, 0xffff, v0, s12
+; GFX10NSA-NEXT:    v_and_or_b32 v2, 0xffff, v2, v3
+; GFX10NSA-NEXT:    v_and_or_b32 v3, 0xffff, v4, s12
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
@@ -592,7 +584,6 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ;
 ; GFX10NSA-LABEL: gather4_l_2d:
 ; GFX10NSA:       ; %bb.0: ; %main_body
-; GFX10NSA-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10NSA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
@@ -601,8 +592,8 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s8, s10
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v3, v1
-; GFX10NSA-NEXT:    v_and_or_b32 v1, v2, v3, s12
+; GFX10NSA-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10NSA-NEXT:    v_and_or_b32 v1, 0xffff, v2, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s1, s3
 ; GFX10NSA-NEXT:    s_mov_b32 s3, s5
 ; GFX10NSA-NEXT:    s_mov_b32 s5, s7
@@ -643,7 +634,6 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ;
 ; GFX10NSA-LABEL: gather4_c_l_2d:
 ; GFX10NSA:       ; %bb.0: ; %main_body
-; GFX10NSA-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10NSA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
@@ -652,8 +642,8 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10NSA-NEXT:    s_mov_b32 s8, s10
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10NSA-NEXT:    v_and_or_b32 v1, v1, v4, v2
-; GFX10NSA-NEXT:    v_and_or_b32 v2, v3, v4, s12
+; GFX10NSA-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10NSA-NEXT:    v_and_or_b32 v2, 0xffff, v3, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s1, s3
 ; GFX10NSA-NEXT:    s_mov_b32 s3, s5
 ; GFX10NSA-NEXT:    s_mov_b32 s5, s7

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
index 88b3244df5fab..3201adf3bd8d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
@@ -564,15 +564,14 @@ define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v3, s0
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, s0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
 ; GFX10-NEXT:    ; return to shader part epilog
   %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <3 x half> %v

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
index cb596f2021e4a..e5108c53440e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
@@ -24,14 +24,13 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc,
 ;
 ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v4, v1
-; GFX10-NEXT:    v_and_or_b32 v1, v2, v4, v3
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v2, v3
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
@@ -79,17 +78,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
 ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    s_mov_b32 s0, s2
+; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v5
-; GFX10-NEXT:    v_and_or_b32 v10, v0, v4, v1
-; GFX10-NEXT:    v_and_or_b32 v11, v2, v4, v3
-; GFX10-NEXT:    s_mov_b32 s1, s3
+; GFX10-NEXT:    v_and_or_b32 v10, 0xffff, v0, v1
+; GFX10-NEXT:    v_and_or_b32 v11, 0xffff, v2, v3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
@@ -148,17 +146,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
 ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    s_mov_b32 s0, s2
+; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v5
-; GFX10-NEXT:    v_and_or_b32 v10, v0, v4, v1
-; GFX10-NEXT:    v_and_or_b32 v11, v2, v4, v3
-; GFX10-NEXT:    s_mov_b32 s1, s3
+; GFX10-NEXT:    v_and_or_b32 v10, 0xffff, v0, v1
+; GFX10-NEXT:    v_and_or_b32 v11, 0xffff, v2, v3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
index dcc3137545e97..533c449de47e1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
@@ -24,15 +24,14 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s,
 ;
 ; GFX10-LABEL: load_3d_v4f32_xyzw:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_lshl_b32 s8, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v3, v1
-; GFX10-NEXT:    v_and_or_b32 v1, v2, v3, s8
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v3
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v2, s8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7
@@ -79,7 +78,6 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
 ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
@@ -90,8 +88,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v5
-; GFX10-NEXT:    v_and_or_b32 v10, v0, v3, v1
-; GFX10-NEXT:    v_and_or_b32 v11, v2, v3, s8
+; GFX10-NEXT:    v_and_or_b32 v10, 0xffff, v0, v1
+; GFX10-NEXT:    v_and_or_b32 v11, 0xffff, v2, s8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7
@@ -148,7 +146,6 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
@@ -159,8 +156,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v5
-; GFX10-NEXT:    v_and_or_b32 v10, v0, v3, v1
-; GFX10-NEXT:    v_and_or_b32 v11, v2, v3, s8
+; GFX10-NEXT:    v_and_or_b32 v10, 0xffff, v0, v1
+; GFX10-NEXT:    v_and_or_b32 v11, 0xffff, v2, s8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
index 7705bb2392ed1..ad20d9d5e3433 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
@@ -4,10 +4,9 @@
 define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
 ; GFX10-LABEL: sample_d_1d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v3, s12
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v3, s12
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, s12
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
 ; GFX10-NEXT:    image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -19,11 +18,10 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v6, v1
-; GFX10-NEXT:    v_and_or_b32 v1, v2, v6, v3
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v2, v3
 ; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -37,14 +35,13 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v3
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v3, v9, v11, s12
-; GFX10-NEXT:    v_and_or_b32 v2, v0, v11, v1
-; GFX10-NEXT:    v_and_or_b32 v4, v10, v11, v4
-; GFX10-NEXT:    v_and_or_b32 v5, v5, v11, s12
+; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v9, s12
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v0, v1
+; GFX10-NEXT:    v_and_or_b32 v4, 0xffff, v10, v4
+; GFX10-NEXT:    v_and_or_b32 v5, 0xffff, v5, s12
 ; GFX10-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -56,10 +53,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) {
 ; GFX10-LABEL: sample_c_d_1d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v4, s12
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v4, s12
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v2, s12
 ; GFX10-NEXT:    image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -71,11 +67,10 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_c_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v3, v7, v4
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v3, v4
 ; GFX10-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -87,10 +82,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
 ; GFX10-LABEL: sample_d_cl_1d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v4, s12
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v4, s12
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, s12
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
 ; GFX10-NEXT:    image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -102,11 +96,10 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_d_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v7, v1
-; GFX10-NEXT:    v_and_or_b32 v1, v2, v7, v3
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v2, v3
 ; GFX10-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -118,10 +111,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
 ; GFX10-LABEL: sample_c_d_cl_1d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v5, s12
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v5, s12
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v2, s12
 ; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -136,11 +128,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
-; GFX10-NEXT:    v_and_or_b32 v4, v9, v0, v4
-; GFX10-NEXT:    v_and_or_b32 v3, v1, v0, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
+; GFX10-NEXT:    v_and_or_b32 v4, 0xffff, v9, v4
+; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v1, v0
 ; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -152,10 +143,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
 ; GFX10-LABEL: sample_cd_1d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v3, s12
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v3, s12
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, s12
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
 ; GFX10-NEXT:    image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -167,11 +157,10 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v6, v1
-; GFX10-NEXT:    v_and_or_b32 v1, v2, v6, v3
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v2, v3
 ; GFX10-NEXT:    image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -183,10 +172,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) {
 ; GFX10-LABEL: sample_c_cd_1d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v4, s12
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v4, s12
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v2, s12
 ; GFX10-NEXT:    image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -198,11 +186,10 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_c_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v3, v7, v4
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v3, v4
 ; GFX10-NEXT:    image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -214,10 +201,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
 ; GFX10-LABEL: sample_cd_cl_1d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v4, s12
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v4, s12
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, s12
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
 ; GFX10-NEXT:    image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -229,11 +215,10 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_cd_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v7, v1
-; GFX10-NEXT:    v_and_or_b32 v1, v2, v7, v3
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v2, v3
 ; GFX10-NEXT:    image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -245,10 +230,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
 ; GFX10-LABEL: sample_c_cd_cl_1d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v5, s12
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v5, s12
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v2, s12
 ; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -263,11 +247,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
-; GFX10-NEXT:    v_and_or_b32 v4, v9, v0, v4
-; GFX10-NEXT:    v_and_or_b32 v3, v1, v0, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
+; GFX10-NEXT:    v_and_or_b32 v4, 0xffff, v9, v4
+; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v1, v0
 ; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -284,11 +267,10 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    v_mov_b32_e32 v11, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v4, v10, v0, v1
-; GFX10-NEXT:    v_and_or_b32 v5, v11, v0, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
+; GFX10-NEXT:    v_and_or_b32 v4, 0xffff, v10, v0
+; GFX10-NEXT:    v_and_or_b32 v5, 0xffff, v11, v1
 ; GFX10-NEXT:    image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -305,11 +287,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX10-NEXT:    v_mov_b32_e32 v11, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v4, v10, v0, v1
-; GFX10-NEXT:    v_and_or_b32 v5, v11, v0, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
+; GFX10-NEXT:    v_and_or_b32 v4, 0xffff, v10, v0
+; GFX10-NEXT:    v_and_or_b32 v5, 0xffff, v11, v1
 ; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 15755f4455cb5..80bc1114de884 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -49,15 +49,14 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float
 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
 ; GCN-LABEL: image_bvh_intersect_ray_a16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s4, 0xffff
 ; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; GCN-NEXT:    v_and_b32_e32 v10, s4, v7
-; GCN-NEXT:    v_and_b32_e32 v8, s4, v8
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v7
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GCN-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; GCN-NEXT:    v_and_or_b32 v5, v5, s4, v9
-; GCN-NEXT:    v_and_or_b32 v6, v6, s4, v10
+; GCN-NEXT:    v_and_or_b32 v5, v5, 0xffff, v9
+; GCN-NEXT:    v_and_or_b32 v6, v6, 0xffff, v10
 ; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
@@ -101,15 +100,14 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr
 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
 ; GCN-LABEL: image_bvh64_intersect_ray_a16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s4, 0xffff
 ; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
-; GCN-NEXT:    v_and_b32_e32 v11, s4, v8
-; GCN-NEXT:    v_and_b32_e32 v9, s4, v9
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff, v8
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; GCN-NEXT:    v_alignbit_b32 v8, v9, v8, 16
-; GCN-NEXT:    v_and_or_b32 v6, v6, s4, v10
-; GCN-NEXT:    v_and_or_b32 v7, v7, s4, v11
+; GCN-NEXT:    v_and_or_b32 v6, v6, 0xffff, v10
+; GCN-NEXT:    v_and_or_b32 v7, v7, 0xffff, v11
 ; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
@@ -202,21 +200,20 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) {
 ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
 ; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX1030-NEXT:    v_mov_b32_e32 v13, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v14, v1
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX1030-NEXT:    v_and_b32_e32 v1, s0, v7
+; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffff, v7
 ; GFX1030-NEXT:    v_mov_b32_e32 v15, v2
-; GFX1030-NEXT:    v_and_b32_e32 v2, s0, v8
+; GFX1030-NEXT:    v_and_b32_e32 v2, 0xffff, v8
 ; GFX1030-NEXT:    v_mov_b32_e32 v16, v3
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1030-NEXT:    v_mov_b32_e32 v17, v4
 ; GFX1030-NEXT:    v_alignbit_b32 v20, v2, v7, 16
 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1030-NEXT:    v_and_or_b32 v18, v5, s0, v0
-; GFX1030-NEXT:    v_and_or_b32 v19, v6, s0, v1
+; GFX1030-NEXT:    v_and_or_b32 v18, v5, 0xffff, v0
+; GFX1030-NEXT:    v_and_or_b32 v19, v6, 0xffff, v1
 ; GFX1030-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v9
 ; GFX1030-NEXT:    v_readfirstlane_b32 s5, v10
@@ -246,16 +243,15 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
 ;
 ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
 ; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX1013-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX1013-NEXT:    v_and_b32_e32 v14, s0, v7
-; GFX1013-NEXT:    v_and_b32_e32 v8, s0, v8
+; GFX1013-NEXT:    v_and_b32_e32 v14, 0xffff, v7
+; GFX1013-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; GFX1013-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; GFX1013-NEXT:    v_and_or_b32 v5, v5, s0, v13
-; GFX1013-NEXT:    v_and_or_b32 v6, v6, s0, v14
+; GFX1013-NEXT:    v_and_or_b32 v5, v5, 0xffff, v13
+; GFX1013-NEXT:    v_and_or_b32 v6, v6, 0xffff, v14
 ; GFX1013-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1013-NEXT:    v_readfirstlane_b32 s4, v9
 ; GFX1013-NEXT:    v_readfirstlane_b32 s5, v10
@@ -371,21 +367,20 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) {
 ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
 ; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX1030-NEXT:    v_mov_b32_e32 v14, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v15, v1
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX1030-NEXT:    v_and_b32_e32 v1, s0, v8
+; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffff, v8
 ; GFX1030-NEXT:    v_mov_b32_e32 v16, v2
-; GFX1030-NEXT:    v_and_b32_e32 v2, s0, v9
+; GFX1030-NEXT:    v_and_b32_e32 v2, 0xffff, v9
 ; GFX1030-NEXT:    v_mov_b32_e32 v17, v3
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1030-NEXT:    v_mov_b32_e32 v18, v4
 ; GFX1030-NEXT:    v_mov_b32_e32 v19, v5
 ; GFX1030-NEXT:    v_alignbit_b32 v22, v2, v8, 16
-; GFX1030-NEXT:    v_and_or_b32 v20, v6, s0, v0
-; GFX1030-NEXT:    v_and_or_b32 v21, v7, s0, v1
+; GFX1030-NEXT:    v_and_or_b32 v20, v6, 0xffff, v0
+; GFX1030-NEXT:    v_and_or_b32 v21, v7, 0xffff, v1
 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1030-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v10
@@ -417,20 +412,19 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 ;
 ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
 ; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX1013-NEXT:    v_mov_b32_e32 v16, v10
 ; GFX1013-NEXT:    v_mov_b32_e32 v17, v11
 ; GFX1013-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
-; GFX1013-NEXT:    v_and_b32_e32 v11, s0, v8
-; GFX1013-NEXT:    v_and_b32_e32 v9, s0, v9
+; GFX1013-NEXT:    v_and_b32_e32 v11, 0xffff, v8
+; GFX1013-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; GFX1013-NEXT:    v_mov_b32_e32 v18, v12
 ; GFX1013-NEXT:    v_mov_b32_e32 v19, v13
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; GFX1013-NEXT:    v_alignbit_b32 v8, v9, v8, 16
 ; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1013-NEXT:    v_and_or_b32 v6, v6, s0, v10
-; GFX1013-NEXT:    v_and_or_b32 v7, v7, s0, v11
+; GFX1013-NEXT:    v_and_or_b32 v6, v6, 0xffff, v10
+; GFX1013-NEXT:    v_and_or_b32 v7, v7, 0xffff, v11
 ; GFX1013-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1013-NEXT:    v_readfirstlane_b32 s4, v16
 ; GFX1013-NEXT:    v_readfirstlane_b32 s5, v17

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
index fa0d86214e65e..723b9d8140db7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
@@ -752,14 +752,13 @@ define amdgpu_kernel void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; GFX6-LABEL: bfe_8_bfe_8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; GFX6-NEXT:    s_mov_b32 s4, 0x80000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_load_dword s3, s[2:3], 0x0
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_bfe_i32 s3, s3, s4
-; GFX6-NEXT:    s_bfe_i32 s3, s3, s4
+; GFX6-NEXT:    s_bfe_i32 s3, s3, 0x80000
+; GFX6-NEXT:    s_bfe_i32 s3, s3, 0x80000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index 866bae4b34008..6a6b12108620f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -66,17 +66,16 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    s_movk_i32 s5, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s5, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, s5, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, s5, v3
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_b32_e32 v5, s5, v6
-; GFX10-NEXT:    v_and_b32_e32 v6, s5, v7
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xff, v6
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_and_or_b32 v3, v4, s5, v3
+; GFX10-NEXT:    v_and_or_b32 v3, v4, 0xff, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index ffcc4ed7d38f2..6b28f32ba0739 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -66,17 +66,16 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    s_movk_i32 s5, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s5, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, s5, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, s5, v3
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_b32_e32 v5, s5, v6
-; GFX10-NEXT:    v_and_b32_e32 v6, s5, v7
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xff, v6
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_and_or_b32 v3, v4, s5, v3
+; GFX10-NEXT:    v_and_or_b32 v3, v4, 0xff, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index c2f624f3a2ee2..107a95da9d169 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -115,9 +115,8 @@ define i24 @v_lshr_i24(i24 %value, i24 %amount) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_mov_b32 s4, 0xffffff
-; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i24 %value, %amount
@@ -631,9 +630,8 @@ define amdgpu_ps i16 @s_lshr_i16_15(i16 inreg %value) {
 define amdgpu_ps half @lshr_i16_sv(i16 inreg %value, i16 %amount) {
 ; GFX6-LABEL: lshr_i16_sv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v0, s1, v0
-; GFX6-NEXT:    s_and_b32 s0, s0, s1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -659,9 +657,8 @@ define amdgpu_ps half @lshr_i16_sv(i16 inreg %value, i16 %amount) {
 define amdgpu_ps half @lshr_i16_vs(i16 %value, i16 inreg %amount) {
 ; GFX6-LABEL: lshr_i16_vs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_and_b32 s0, s0, s1
-; GFX6-NEXT:    v_and_b32_e32 v0, s1, v0
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -757,9 +754,8 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
 define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) {
 ; GFX6-LABEL: s_lshr_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s1, s4
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
@@ -768,14 +764,13 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ;
 ; GFX8-LABEL: s_lshr_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s3
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
-; GFX8-NEXT:    s_lshr_b32 s1, s2, s4
+; GFX8-NEXT:    s_lshr_b32 s1, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s3
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -808,10 +803,10 @@ define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
-; GFX6-NEXT:    s_and_b32 s0, s1, s2
+; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -844,10 +839,10 @@ define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount)
 ; GFX6-LABEL: lshr_v2i16_vs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
-; GFX6-NEXT:    s_and_b32 s0, s1, s2
+; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -944,13 +939,12 @@ define <2 x float> @v_lshr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) {
 ; GFX6-LABEL: s_lshr_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s8, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s1, s8
-; GFX6-NEXT:    s_and_b32 s0, s0, s8
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, s5
-; GFX6-NEXT:    s_and_b32 s3, s3, s8
+; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, s4
-; GFX6-NEXT:    s_and_b32 s2, s2, s8
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, s7
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s6
@@ -961,36 +955,34 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ;
 ; GFX8-LABEL: s_lshr_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s6, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s6
-; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, s6
-; GFX8-NEXT:    s_lshr_b32 s8, s3, 16
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s2
-; GFX8-NEXT:    s_lshr_b32 s2, s4, s7
+; GFX8-NEXT:    s_lshr_b32 s2, s4, s6
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s3
-; GFX8-NEXT:    s_lshr_b32 s3, s5, s8
+; GFX8-NEXT:    s_lshr_b32 s3, s5, s7
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s6
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s2, s0
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, s6
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_lshr_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX9-NEXT:    s_and_b32 s0, s0, s5
-; GFX9-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s0, s0, s2
-; GFX9-NEXT:    s_lshr_b32 s2, s4, s6
+; GFX9-NEXT:    s_lshr_b32 s2, s4, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
-; GFX9-NEXT:    s_and_b32 s1, s1, s5
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, s4
@@ -999,17 +991,16 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ;
 ; GFX10-LABEL: s_lshr_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s4, 0xffff
-; GFX10-NEXT:    s_lshr_b32 s5, s0, 16
-; GFX10-NEXT:    s_and_b32 s0, s0, s4
-; GFX10-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, s2
-; GFX10-NEXT:    s_lshr_b32 s2, s5, s6
-; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX10-NEXT:    s_and_b32 s1, s1, s4
-; GFX10-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX10-NEXT:    s_lshr_b32 s2, s4, s5
+; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, s3
-; GFX10-NEXT:    s_lshr_b32 s3, s5, s4
+; GFX10-NEXT:    s_lshr_b32 s3, s4, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -1124,21 +1115,20 @@ define <4 x float> @v_lshr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) {
 ; GFX6-LABEL: s_lshr_v8i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s16, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s1, s16
-; GFX6-NEXT:    s_and_b32 s0, s0, s16
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, s9
-; GFX6-NEXT:    s_and_b32 s3, s3, s16
+; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, s8
-; GFX6-NEXT:    s_and_b32 s2, s2, s16
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, s11
-; GFX6-NEXT:    s_and_b32 s5, s5, s16
-; GFX6-NEXT:    s_and_b32 s7, s7, s16
+; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s10
-; GFX6-NEXT:    s_and_b32 s4, s4, s16
+; GFX6-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s5, s5, s13
-; GFX6-NEXT:    s_and_b32 s6, s6, s16
+; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s7, s7, s15
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
@@ -1153,64 +1143,62 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ;
 ; GFX8-LABEL: s_lshr_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s12, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s12
-; GFX8-NEXT:    s_lshr_b32 s13, s4, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
 ; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, s12
-; GFX8-NEXT:    s_lshr_b32 s14, s5, 16
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
-; GFX8-NEXT:    s_lshr_b32 s4, s8, s13
+; GFX8-NEXT:    s_lshr_b32 s4, s8, s12
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, s12
-; GFX8-NEXT:    s_lshr_b32 s15, s6, 16
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s5
-; GFX8-NEXT:    s_lshr_b32 s5, s9, s14
+; GFX8-NEXT:    s_lshr_b32 s5, s9, s13
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s12
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, s12
-; GFX8-NEXT:    s_lshr_b32 s16, s7, 16
+; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, s6
-; GFX8-NEXT:    s_lshr_b32 s6, s10, s15
+; GFX8-NEXT:    s_lshr_b32 s6, s10, s14
 ; GFX8-NEXT:    s_or_b32 s0, s4, s0
 ; GFX8-NEXT:    s_lshl_b32 s4, s5, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, s12
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s7
-; GFX8-NEXT:    s_lshr_b32 s7, s11, s16
+; GFX8-NEXT:    s_lshr_b32 s7, s11, s15
 ; GFX8-NEXT:    s_or_b32 s1, s4, s1
 ; GFX8-NEXT:    s_lshl_b32 s4, s6, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, s12
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX8-NEXT:    s_or_b32 s2, s4, s2
 ; GFX8-NEXT:    s_lshl_b32 s4, s7, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, s12
+; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX8-NEXT:    s_or_b32 s3, s4, s3
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_lshr_v8i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s9, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX9-NEXT:    s_and_b32 s0, s0, s9
-; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT:    s_lshr_b32 s9, s4, 16
 ; GFX9-NEXT:    s_lshr_b32 s0, s0, s4
-; GFX9-NEXT:    s_lshr_b32 s4, s8, s10
+; GFX9-NEXT:    s_lshr_b32 s4, s8, s9
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX9-NEXT:    s_and_b32 s1, s1, s9
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s8, s5, 16
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s5
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, s8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX9-NEXT:    s_and_b32 s2, s2, s9
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s5, s6, 16
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, s6
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
-; GFX9-NEXT:    s_and_b32 s3, s3, s9
+; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s5, s7, 16
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, s7
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, s5
@@ -1219,26 +1207,25 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ;
 ; GFX10-LABEL: s_lshr_v8i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s8, 0xffff
-; GFX10-NEXT:    s_lshr_b32 s9, s0, 16
-; GFX10-NEXT:    s_and_b32 s0, s0, s8
-; GFX10-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10-NEXT:    s_lshr_b32 s9, s4, 16
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, s4
-; GFX10-NEXT:    s_lshr_b32 s4, s9, s10
-; GFX10-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX10-NEXT:    s_and_b32 s1, s1, s8
-; GFX10-NEXT:    s_lshr_b32 s10, s5, 16
+; GFX10-NEXT:    s_lshr_b32 s4, s8, s9
+; GFX10-NEXT:    s_lshr_b32 s8, s1, 16
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX10-NEXT:    s_lshr_b32 s9, s5, 16
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, s5
-; GFX10-NEXT:    s_lshr_b32 s5, s9, s10
+; GFX10-NEXT:    s_lshr_b32 s5, s8, s9
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
 ; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX10-NEXT:    s_and_b32 s2, s2, s8
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX10-NEXT:    s_lshr_b32 s5, s6, 16
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, s6
 ; GFX10-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX10-NEXT:    s_and_b32 s3, s3, s8
+; GFX10-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX10-NEXT:    s_lshr_b32 s6, s7, 16
 ; GFX10-NEXT:    s_lshr_b32 s3, s3, s7
 ; GFX10-NEXT:    s_lshr_b32 s5, s5, s6

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index ccf6e6be39be3..e5586787ac5b3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -12,25 +12,22 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
 ;
 ; GFX8-LABEL: s_mul_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_mul_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
-; GFX9-NEXT:    s_and_b32 s0, s0, s2
-; GFX9-NEXT:    s_and_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX9-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_mul_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s2, 0xffff
-; GFX10-NEXT:    s_and_b32 s0, s0, s2
-; GFX10-NEXT:    s_and_b32 s1, s1, s2
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX10-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
@@ -78,29 +75,26 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre
 ;
 ; GFX8-LABEL: s_mul_i16_zeroext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_mul_i32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_mul_i16_zeroext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
-; GFX9-NEXT:    s_and_b32 s0, s0, s2
-; GFX9-NEXT:    s_and_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_mul_i16_zeroext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s2, 0xffff
-; GFX10-NEXT:    s_and_b32 s0, s0, s2
-; GFX10-NEXT:    s_and_b32 s1, s1, s2
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX10-NEXT:    s_mul_i32 s0, s0, s1
-; GFX10-NEXT:    s_and_b32 s0, s0, s2
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
   ret i16 %result
@@ -146,27 +140,24 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre
 ;
 ; GFX8-LABEL: s_mul_i16_signext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_mul_i16_signext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
-; GFX9-NEXT:    s_and_b32 s0, s0, s2
-; GFX9-NEXT:    s_and_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX9-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX9-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_mul_i16_signext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s2, 0xffff
-; GFX10-NEXT:    s_and_b32 s0, s0, s2
-; GFX10-NEXT:    s_and_b32 s1, s1, s2
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX10-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX10-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX10-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index bec739bd0bf06..e325c8b9af021 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -429,13 +429,12 @@ define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) {
 define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
 ; GFX6-LABEL: s_orn2_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_and_b32 s1, s4, s1
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_xor_b32 s1, s1, -1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -458,13 +457,12 @@ define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1)
 define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
 ; GFX6-LABEL: s_orn2_v2i16_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_and_b32 s1, s4, s1
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_xor_b32 s1, s1, -1
 ; GFX6-NEXT:    s_or_b32 s0, s1, s0
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -487,13 +485,12 @@ define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inre
 define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
 ; GFX6-LABEL: s_orn2_v2i16_multi_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_and_b32 s1, s4, s1
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_xor_b32 s1, s1, -1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -522,19 +519,18 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2
 define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
 ; GFX6-LABEL: s_orn2_v2i16_multi_foldable_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_and_b32 s3, s4, s1
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
-; GFX6-NEXT:    s_and_b32 s1, s6, s1
-; GFX6-NEXT:    s_or_b32 s1, s3, s1
-; GFX6-NEXT:    s_xor_b32 s1, s1, -1
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX6-NEXT:    s_or_b32 s1, s1, s2
+; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
+; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_xor_b32 s2, s2, -1
+; GFX6-NEXT:    s_or_b32 s0, s0, s2
+; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_v2i16_multi_foldable_use:
@@ -630,18 +626,17 @@ define amdgpu_ps i64 @s_orn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1)
 ; GFX6-LABEL: s_orn2_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_mov_b32 s3, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s2, s3
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
-; GFX6-NEXT:    s_and_b32 s2, s4, s3
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
-; GFX6-NEXT:    s_and_b32 s4, s6, s3
-; GFX6-NEXT:    s_or_b32 s2, s2, s4
-; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
-; GFX6-NEXT:    s_and_b32 s3, s8, s3
-; GFX6-NEXT:    s_or_b32 s3, s4, s3
+; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
+; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
+; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, s4
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
@@ -673,18 +668,17 @@ define amdgpu_ps i64 @s_orn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inre
 ; GFX6-LABEL: s_orn2_v4i16_commute:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_mov_b32 s3, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s2, s3
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
-; GFX6-NEXT:    s_and_b32 s2, s4, s3
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
-; GFX6-NEXT:    s_and_b32 s4, s6, s3
-; GFX6-NEXT:    s_or_b32 s2, s2, s4
-; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
-; GFX6-NEXT:    s_and_b32 s3, s8, s3
-; GFX6-NEXT:    s_or_b32 s3, s4, s3
+; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
+; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
+; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, s4
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
@@ -716,18 +710,17 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
 ; GFX6-LABEL: s_orn2_v4i16_multi_use:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_mov_b32 s3, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s2, s3
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
-; GFX6-NEXT:    s_and_b32 s2, s4, s3
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
-; GFX6-NEXT:    s_and_b32 s4, s6, s3
-; GFX6-NEXT:    s_or_b32 s2, s2, s4
-; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
-; GFX6-NEXT:    s_and_b32 s3, s8, s3
-; GFX6-NEXT:    s_or_b32 s3, s4, s3
+; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
+; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
+; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, s4
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
@@ -766,24 +759,23 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
 define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) {
 ; GFX6-LABEL: s_orn2_v4i16_multi_foldable_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s14, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_and_b32 s1, s2, s14
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
-; GFX6-NEXT:    s_and_b32 s2, s4, s14
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
-; GFX6-NEXT:    s_and_b32 s3, s6, s14
+; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
-; GFX6-NEXT:    s_and_b32 s4, s8, s14
+; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s11, 16
-; GFX6-NEXT:    s_and_b32 s5, s10, s14
+; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
 ; GFX6-NEXT:    s_or_b32 s4, s4, s5
 ; GFX6-NEXT:    s_lshl_b32 s5, s13, 16
-; GFX6-NEXT:    s_and_b32 s6, s12, s14
+; GFX6-NEXT:    s_and_b32 s6, s12, 0xffff
 ; GFX6-NEXT:    s_or_b32 s5, s5, s6
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_mov_b32 s7, s6

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
index 6416026570058..0a2414d397cb9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
@@ -401,13 +401,12 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_rndne_f16_e32 v3, v0
+; GFX10-NEXT:    v_rndne_f16_e32 v2, v0
 ; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_rndne_f16_e32 v4, v1
+; GFX10-NEXT:    v_rndne_f16_e32 v3, v1
 ; GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_and_or_b32 v0, v3, v2, v0
-; GFX10-NEXT:    v_and_or_b32 v1, v4, v2, v1
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v3, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x)
   ret <4 x half> %roundeven
@@ -610,8 +609,8 @@ define <2 x double> @v_roundeven_v2f64(<2 x double> %x) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_brev_b32 s6, 1
-; GFX6-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX6-NEXT:    v_and_b32_e32 v5, s6, v1
+; GFX6-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX6-NEXT:    v_or_b32_e32 v5, s7, v5
 ; GFX6-NEXT:    v_add_f64 v[6:7], v[0:1], v[4:5]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index db72cf406c9cd..9bf8d106cf01d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -320,12 +320,11 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v4
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
@@ -345,31 +344,28 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_min_i32 s7, s0, 0
+; GFX6-NEXT:    s_min_i32 s5, s0, 0
 ; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_brev_b32 s4, -2
-; GFX6-NEXT:    s_max_i32 s6, s0, 0
-; GFX6-NEXT:    s_sub_i32 s7, s5, s7
-; GFX6-NEXT:    s_sub_i32 s6, s4, s6
-; GFX6-NEXT:    s_max_i32 s1, s7, s1
-; GFX6-NEXT:    s_min_i32 s1, s1, s6
+; GFX6-NEXT:    s_max_i32 s4, s0, 0
+; GFX6-NEXT:    s_sub_i32 s5, 0x80000000, s5
+; GFX6-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
+; GFX6-NEXT:    s_max_i32 s1, s5, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s4
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
+; GFX6-NEXT:    s_min_i32 s4, s1, 0
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
 ; GFX6-NEXT:    s_max_i32 s3, s1, 0
-; GFX6-NEXT:    s_sub_i32 s3, s4, s3
-; GFX6-NEXT:    s_min_i32 s4, s1, 0
-; GFX6-NEXT:    s_sub_i32 s4, s5, s4
+; GFX6-NEXT:    s_sub_i32 s4, 0x80000000, s4
+; GFX6-NEXT:    s_sub_i32 s3, 0x7fffffff, s3
 ; GFX6-NEXT:    s_max_i32 s2, s4, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s3
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
-; GFX6-NEXT:    s_movk_i32 s2, 0xff
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
-; GFX6-NEXT:    s_and_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -379,32 +375,30 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_bfe_u32 s4, 8, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX8-NEXT:    s_sext_i32_i16 s7, s0
-; GFX8-NEXT:    s_sext_i32_i16 s8, 0
-; GFX8-NEXT:    s_movk_i32 s6, 0x8000
-; GFX8-NEXT:    s_max_i32 s9, s7, s8
-; GFX8-NEXT:    s_min_i32 s7, s7, s8
+; GFX8-NEXT:    s_sext_i32_i16 s5, s0
+; GFX8-NEXT:    s_sext_i32_i16 s6, 0
+; GFX8-NEXT:    s_max_i32 s7, s5, s6
+; GFX8-NEXT:    s_min_i32 s5, s5, s6
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX8-NEXT:    s_sub_i32 s7, s6, s7
-; GFX8-NEXT:    s_movk_i32 s5, 0x7fff
-; GFX8-NEXT:    s_sext_i32_i16 s7, s7
+; GFX8-NEXT:    s_sub_i32 s5, 0xffff8000, s5
+; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sub_i32 s9, s5, s9
-; GFX8-NEXT:    s_max_i32 s1, s7, s1
+; GFX8-NEXT:    s_sub_i32 s7, 0x7fff, s7
+; GFX8-NEXT:    s_max_i32 s1, s5, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sext_i32_i16 s7, s9
-; GFX8-NEXT:    s_min_i32 s1, s1, s7
+; GFX8-NEXT:    s_sext_i32_i16 s5, s7
+; GFX8-NEXT:    s_min_i32 s1, s1, s5
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s1
-; GFX8-NEXT:    s_max_i32 s7, s3, s8
-; GFX8-NEXT:    s_min_i32 s3, s3, s8
-; GFX8-NEXT:    s_sub_i32 s3, s6, s3
+; GFX8-NEXT:    s_max_i32 s5, s3, s6
+; GFX8-NEXT:    s_min_i32 s3, s3, s6
+; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sub_i32 s5, s5, s7
+; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
 ; GFX8-NEXT:    s_max_i32 s2, s3, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s5
@@ -413,10 +407,9 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s4
-; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s4
-; GFX8-NEXT:    s_and_b32 s1, s1, s2
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -424,17 +417,16 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-LABEL: s_saddsat_v2i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX9-NEXT:    s_mov_b32 s2, 0x80008
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
-; GFX9-NEXT:    s_lshl_b32 s2, s3, 8
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
@@ -451,15 +443,14 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT:    s_mov_b32 s2, 0x80008
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
-; GFX10-NEXT:    s_lshl_b32 s2, s4, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
 ; GFX10-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
 ; GFX10-NEXT:    s_movk_i32 s0, 0xff
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
@@ -639,35 +630,33 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v7
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_and_or_b32 v3, v6, v2, v4
-; GFX10-NEXT:    v_and_or_b32 v2, v8, v2, v5
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v6
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v5, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v7, v4
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 24
-; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
-; GFX10-NEXT:    v_pk_add_i16 v1, v3, v2 clamp
+; GFX10-NEXT:    v_pk_add_i16 v1, v2, v3 clamp
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_b32_e32 v3, s4, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v2
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -685,59 +674,56 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX6-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
-; GFX6-NEXT:    s_brev_b32 s9, 1
-; GFX6-NEXT:    s_min_i32 s11, s0, 0
+; GFX6-NEXT:    s_min_i32 s9, s0, 0
 ; GFX6-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX6-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_brev_b32 s8, -2
-; GFX6-NEXT:    s_max_i32 s10, s0, 0
-; GFX6-NEXT:    s_sub_i32 s11, s9, s11
-; GFX6-NEXT:    s_sub_i32 s10, s8, s10
-; GFX6-NEXT:    s_max_i32 s1, s11, s1
-; GFX6-NEXT:    s_min_i32 s1, s1, s10
+; GFX6-NEXT:    s_max_i32 s8, s0, 0
+; GFX6-NEXT:    s_sub_i32 s9, 0x80000000, s9
+; GFX6-NEXT:    s_sub_i32 s8, 0x7fffffff, s8
+; GFX6-NEXT:    s_max_i32 s1, s9, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s8
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
-; GFX6-NEXT:    s_min_i32 s10, s1, 0
+; GFX6-NEXT:    s_min_i32 s8, s1, 0
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
 ; GFX6-NEXT:    s_max_i32 s5, s1, 0
-; GFX6-NEXT:    s_sub_i32 s10, s9, s10
-; GFX6-NEXT:    s_sub_i32 s5, s8, s5
-; GFX6-NEXT:    s_max_i32 s2, s10, s2
+; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
+; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
+; GFX6-NEXT:    s_max_i32 s2, s8, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s5
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
 ; GFX6-NEXT:    s_lshl_b32 s3, s6, 24
 ; GFX6-NEXT:    s_min_i32 s6, s2, 0
 ; GFX6-NEXT:    s_max_i32 s5, s2, 0
-; GFX6-NEXT:    s_sub_i32 s6, s9, s6
-; GFX6-NEXT:    s_sub_i32 s5, s8, s5
+; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
+; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
 ; GFX6-NEXT:    s_max_i32 s3, s6, s3
 ; GFX6-NEXT:    s_min_i32 s3, s3, s5
 ; GFX6-NEXT:    s_add_i32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
 ; GFX6-NEXT:    s_min_i32 s6, s3, 0
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
 ; GFX6-NEXT:    s_max_i32 s5, s3, 0
-; GFX6-NEXT:    s_sub_i32 s6, s9, s6
-; GFX6-NEXT:    s_sub_i32 s5, s8, s5
-; GFX6-NEXT:    s_max_i32 s4, s6, s4
-; GFX6-NEXT:    s_min_i32 s4, s4, s5
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
-; GFX6-NEXT:    s_add_i32 s3, s3, s4
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
+; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
-; GFX6-NEXT:    s_and_b32 s1, s1, s4
+; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
+; GFX6-NEXT:    s_max_i32 s4, s6, s4
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 24
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
+; GFX6-NEXT:    s_min_i32 s4, s4, s5
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX6-NEXT:    s_add_i32 s3, s3, s4
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s4
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s3, s4
+; GFX6-NEXT:    s_and_b32 s1, s3, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -749,48 +735,46 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s8
-; GFX8-NEXT:    s_sext_i32_i16 s11, s0
-; GFX8-NEXT:    s_sext_i32_i16 s12, 0
-; GFX8-NEXT:    s_movk_i32 s10, 0x8000
-; GFX8-NEXT:    s_max_i32 s13, s11, s12
-; GFX8-NEXT:    s_min_i32 s11, s11, s12
+; GFX8-NEXT:    s_sext_i32_i16 s9, s0
+; GFX8-NEXT:    s_sext_i32_i16 s10, 0
+; GFX8-NEXT:    s_max_i32 s11, s9, s10
+; GFX8-NEXT:    s_min_i32 s9, s9, s10
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s8
-; GFX8-NEXT:    s_sub_i32 s11, s10, s11
-; GFX8-NEXT:    s_movk_i32 s9, 0x7fff
-; GFX8-NEXT:    s_sext_i32_i16 s11, s11
+; GFX8-NEXT:    s_sub_i32 s9, 0xffff8000, s9
+; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sub_i32 s13, s9, s13
-; GFX8-NEXT:    s_max_i32 s1, s11, s1
+; GFX8-NEXT:    s_sub_i32 s11, 0x7fff, s11
+; GFX8-NEXT:    s_max_i32 s1, s9, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sext_i32_i16 s11, s13
-; GFX8-NEXT:    s_min_i32 s1, s1, s11
+; GFX8-NEXT:    s_sext_i32_i16 s9, s11
+; GFX8-NEXT:    s_min_i32 s1, s1, s9
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, s8
 ; GFX8-NEXT:    s_lshl_b32 s2, s5, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s1
-; GFX8-NEXT:    s_max_i32 s11, s5, s12
-; GFX8-NEXT:    s_min_i32 s5, s5, s12
-; GFX8-NEXT:    s_sub_i32 s5, s10, s5
+; GFX8-NEXT:    s_max_i32 s9, s5, s10
+; GFX8-NEXT:    s_min_i32 s5, s5, s10
+; GFX8-NEXT:    s_sub_i32 s5, 0xffff8000, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sub_i32 s11, s9, s11
+; GFX8-NEXT:    s_sub_i32 s9, 0x7fff, s9
 ; GFX8-NEXT:    s_max_i32 s2, s5, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sext_i32_i16 s5, s11
+; GFX8-NEXT:    s_sext_i32_i16 s5, s9
 ; GFX8-NEXT:    s_min_i32 s2, s2, s5
 ; GFX8-NEXT:    s_add_i32 s1, s1, s2
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX8-NEXT:    s_lshl_b32 s3, s6, s8
-; GFX8-NEXT:    s_max_i32 s6, s5, s12
-; GFX8-NEXT:    s_min_i32 s5, s5, s12
-; GFX8-NEXT:    s_sub_i32 s5, s10, s5
+; GFX8-NEXT:    s_max_i32 s6, s5, s10
+; GFX8-NEXT:    s_min_i32 s5, s5, s10
+; GFX8-NEXT:    s_sub_i32 s5, 0xffff8000, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sub_i32 s6, s9, s6
+; GFX8-NEXT:    s_sub_i32 s6, 0x7fff, s6
 ; GFX8-NEXT:    s_max_i32 s3, s5, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s6
@@ -798,35 +782,34 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_add_i32 s2, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s3, s4, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s3
-; GFX8-NEXT:    s_max_i32 s6, s5, s12
-; GFX8-NEXT:    s_min_i32 s5, s5, s12
+; GFX8-NEXT:    s_max_i32 s6, s5, s10
+; GFX8-NEXT:    s_min_i32 s5, s5, s10
 ; GFX8-NEXT:    s_lshl_b32 s4, s7, s8
-; GFX8-NEXT:    s_sub_i32 s5, s10, s5
+; GFX8-NEXT:    s_sub_i32 s5, 0xffff8000, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_sub_i32 s6, s9, s6
-; GFX8-NEXT:    s_max_i32 s4, s5, s4
-; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_sext_i32_i16 s5, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_min_i32 s4, s4, s5
+; GFX8-NEXT:    s_sub_i32 s6, 0x7fff, s6
+; GFX8-NEXT:    s_max_i32 s4, s5, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s8
-; GFX8-NEXT:    s_add_i32 s3, s3, s4
-; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_sext_i32_i16 s4, s4
+; GFX8-NEXT:    s_sext_i32_i16 s5, s6
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_and_b32 s1, s1, s4
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, s8
-; GFX8-NEXT:    s_and_b32 s0, s0, s4
+; GFX8-NEXT:    s_add_i32 s3, s3, s4
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s2, s4
+; GFX8-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, s8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s3, s4
+; GFX8-NEXT:    s_and_b32 s1, s3, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -838,27 +821,26 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
-; GFX9-NEXT:    s_mov_b32 s4, 0x80008
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshl_b32 s3, s3, s4
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s8, s9
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 0x80008
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s8, s9
-; GFX9-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
-; GFX9-NEXT:    s_lshl_b32 s4, s6, s4
-; GFX9-NEXT:    s_lshl_b32 s6, s7, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
@@ -885,39 +867,37 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
-; GFX10-NEXT:    s_mov_b32 s3, 0x80008
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 0x80008
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s6, s7
-; GFX10-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX10-NEXT:    s_lshl_b32 s1, s1, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 0x80008
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
-; GFX10-NEXT:    s_lshl_b32 s4, s6, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
 ; GFX10-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_add_i16 v1, s2, s3 clamp
 ; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    s_movk_i32 s1, 0xff
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_b32_e32 v3, s1, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, 24
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v2
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -1269,19 +1249,17 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_min_i32 s7, s0, 0
-; GFX6-NEXT:    s_brev_b32 s4, -2
-; GFX6-NEXT:    s_max_i32 s6, s0, 0
-; GFX6-NEXT:    s_sub_i32 s7, s5, s7
-; GFX6-NEXT:    s_sub_i32 s6, s4, s6
-; GFX6-NEXT:    s_max_i32 s2, s7, s2
-; GFX6-NEXT:    s_min_i32 s2, s2, s6
+; GFX6-NEXT:    s_min_i32 s5, s0, 0
+; GFX6-NEXT:    s_max_i32 s4, s0, 0
+; GFX6-NEXT:    s_sub_i32 s5, 0x80000000, s5
+; GFX6-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
+; GFX6-NEXT:    s_max_i32 s2, s5, s2
+; GFX6-NEXT:    s_min_i32 s2, s2, s4
+; GFX6-NEXT:    s_min_i32 s4, s1, 0
 ; GFX6-NEXT:    s_add_i32 s0, s0, s2
 ; GFX6-NEXT:    s_max_i32 s2, s1, 0
-; GFX6-NEXT:    s_sub_i32 s2, s4, s2
-; GFX6-NEXT:    s_min_i32 s4, s1, 0
-; GFX6-NEXT:    s_sub_i32 s4, s5, s4
+; GFX6-NEXT:    s_sub_i32 s4, 0x80000000, s4
+; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
 ; GFX6-NEXT:    s_max_i32 s3, s4, s3
 ; GFX6-NEXT:    s_min_i32 s2, s3, s2
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
@@ -1289,19 +1267,17 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ;
 ; GFX8-LABEL: s_saddsat_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    s_min_i32 s7, s0, 0
-; GFX8-NEXT:    s_brev_b32 s4, -2
-; GFX8-NEXT:    s_max_i32 s6, s0, 0
-; GFX8-NEXT:    s_sub_i32 s7, s5, s7
-; GFX8-NEXT:    s_sub_i32 s6, s4, s6
-; GFX8-NEXT:    s_max_i32 s2, s7, s2
-; GFX8-NEXT:    s_min_i32 s2, s2, s6
+; GFX8-NEXT:    s_min_i32 s5, s0, 0
+; GFX8-NEXT:    s_max_i32 s4, s0, 0
+; GFX8-NEXT:    s_sub_i32 s5, 0x80000000, s5
+; GFX8-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
+; GFX8-NEXT:    s_max_i32 s2, s5, s2
+; GFX8-NEXT:    s_min_i32 s2, s2, s4
+; GFX8-NEXT:    s_min_i32 s4, s1, 0
 ; GFX8-NEXT:    s_add_i32 s0, s0, s2
 ; GFX8-NEXT:    s_max_i32 s2, s1, 0
-; GFX8-NEXT:    s_sub_i32 s2, s4, s2
-; GFX8-NEXT:    s_min_i32 s4, s1, 0
-; GFX8-NEXT:    s_sub_i32 s4, s5, s4
+; GFX8-NEXT:    s_sub_i32 s4, 0x80000000, s4
+; GFX8-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
 ; GFX8-NEXT:    s_max_i32 s3, s4, s3
 ; GFX8-NEXT:    s_min_i32 s2, s3, s2
 ; GFX8-NEXT:    s_add_i32 s1, s1, s2
@@ -1408,26 +1384,24 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_brev_b32 s7, 1
-; GFX6-NEXT:    s_min_i32 s9, s0, 0
-; GFX6-NEXT:    s_brev_b32 s6, -2
-; GFX6-NEXT:    s_max_i32 s8, s0, 0
-; GFX6-NEXT:    s_sub_i32 s9, s7, s9
-; GFX6-NEXT:    s_sub_i32 s8, s6, s8
-; GFX6-NEXT:    s_max_i32 s3, s9, s3
-; GFX6-NEXT:    s_min_i32 s3, s3, s8
-; GFX6-NEXT:    s_min_i32 s8, s1, 0
+; GFX6-NEXT:    s_min_i32 s7, s0, 0
+; GFX6-NEXT:    s_max_i32 s6, s0, 0
+; GFX6-NEXT:    s_sub_i32 s7, 0x80000000, s7
+; GFX6-NEXT:    s_sub_i32 s6, 0x7fffffff, s6
+; GFX6-NEXT:    s_max_i32 s3, s7, s3
+; GFX6-NEXT:    s_min_i32 s3, s3, s6
+; GFX6-NEXT:    s_min_i32 s6, s1, 0
 ; GFX6-NEXT:    s_add_i32 s0, s0, s3
 ; GFX6-NEXT:    s_max_i32 s3, s1, 0
-; GFX6-NEXT:    s_sub_i32 s8, s7, s8
-; GFX6-NEXT:    s_sub_i32 s3, s6, s3
-; GFX6-NEXT:    s_max_i32 s4, s8, s4
+; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
+; GFX6-NEXT:    s_sub_i32 s3, 0x7fffffff, s3
+; GFX6-NEXT:    s_max_i32 s4, s6, s4
 ; GFX6-NEXT:    s_min_i32 s3, s4, s3
 ; GFX6-NEXT:    s_min_i32 s4, s2, 0
 ; GFX6-NEXT:    s_add_i32 s1, s1, s3
 ; GFX6-NEXT:    s_max_i32 s3, s2, 0
-; GFX6-NEXT:    s_sub_i32 s4, s7, s4
-; GFX6-NEXT:    s_sub_i32 s3, s6, s3
+; GFX6-NEXT:    s_sub_i32 s4, 0x80000000, s4
+; GFX6-NEXT:    s_sub_i32 s3, 0x7fffffff, s3
 ; GFX6-NEXT:    s_max_i32 s4, s4, s5
 ; GFX6-NEXT:    s_min_i32 s3, s4, s3
 ; GFX6-NEXT:    s_add_i32 s2, s2, s3
@@ -1435,26 +1409,24 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ;
 ; GFX8-LABEL: s_saddsat_v3i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_brev_b32 s7, 1
-; GFX8-NEXT:    s_min_i32 s9, s0, 0
-; GFX8-NEXT:    s_brev_b32 s6, -2
-; GFX8-NEXT:    s_max_i32 s8, s0, 0
-; GFX8-NEXT:    s_sub_i32 s9, s7, s9
-; GFX8-NEXT:    s_sub_i32 s8, s6, s8
-; GFX8-NEXT:    s_max_i32 s3, s9, s3
-; GFX8-NEXT:    s_min_i32 s3, s3, s8
-; GFX8-NEXT:    s_min_i32 s8, s1, 0
+; GFX8-NEXT:    s_min_i32 s7, s0, 0
+; GFX8-NEXT:    s_max_i32 s6, s0, 0
+; GFX8-NEXT:    s_sub_i32 s7, 0x80000000, s7
+; GFX8-NEXT:    s_sub_i32 s6, 0x7fffffff, s6
+; GFX8-NEXT:    s_max_i32 s3, s7, s3
+; GFX8-NEXT:    s_min_i32 s3, s3, s6
+; GFX8-NEXT:    s_min_i32 s6, s1, 0
 ; GFX8-NEXT:    s_add_i32 s0, s0, s3
 ; GFX8-NEXT:    s_max_i32 s3, s1, 0
-; GFX8-NEXT:    s_sub_i32 s8, s7, s8
-; GFX8-NEXT:    s_sub_i32 s3, s6, s3
-; GFX8-NEXT:    s_max_i32 s4, s8, s4
+; GFX8-NEXT:    s_sub_i32 s6, 0x80000000, s6
+; GFX8-NEXT:    s_sub_i32 s3, 0x7fffffff, s3
+; GFX8-NEXT:    s_max_i32 s4, s6, s4
 ; GFX8-NEXT:    s_min_i32 s3, s4, s3
 ; GFX8-NEXT:    s_min_i32 s4, s2, 0
 ; GFX8-NEXT:    s_add_i32 s1, s1, s3
 ; GFX8-NEXT:    s_max_i32 s3, s2, 0
-; GFX8-NEXT:    s_sub_i32 s4, s7, s4
-; GFX8-NEXT:    s_sub_i32 s3, s6, s3
+; GFX8-NEXT:    s_sub_i32 s4, 0x80000000, s4
+; GFX8-NEXT:    s_sub_i32 s3, 0x7fffffff, s3
 ; GFX8-NEXT:    s_max_i32 s4, s4, s5
 ; GFX8-NEXT:    s_min_i32 s3, s4, s3
 ; GFX8-NEXT:    s_add_i32 s2, s2, s3
@@ -1582,33 +1554,31 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_brev_b32 s9, 1
-; GFX6-NEXT:    s_min_i32 s11, s0, 0
-; GFX6-NEXT:    s_brev_b32 s8, -2
-; GFX6-NEXT:    s_max_i32 s10, s0, 0
-; GFX6-NEXT:    s_sub_i32 s11, s9, s11
-; GFX6-NEXT:    s_sub_i32 s10, s8, s10
-; GFX6-NEXT:    s_max_i32 s4, s11, s4
-; GFX6-NEXT:    s_min_i32 s4, s4, s10
-; GFX6-NEXT:    s_min_i32 s10, s1, 0
+; GFX6-NEXT:    s_min_i32 s9, s0, 0
+; GFX6-NEXT:    s_max_i32 s8, s0, 0
+; GFX6-NEXT:    s_sub_i32 s9, 0x80000000, s9
+; GFX6-NEXT:    s_sub_i32 s8, 0x7fffffff, s8
+; GFX6-NEXT:    s_max_i32 s4, s9, s4
+; GFX6-NEXT:    s_min_i32 s4, s4, s8
+; GFX6-NEXT:    s_min_i32 s8, s1, 0
 ; GFX6-NEXT:    s_add_i32 s0, s0, s4
 ; GFX6-NEXT:    s_max_i32 s4, s1, 0
-; GFX6-NEXT:    s_sub_i32 s10, s9, s10
-; GFX6-NEXT:    s_sub_i32 s4, s8, s4
-; GFX6-NEXT:    s_max_i32 s5, s10, s5
+; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
+; GFX6-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
+; GFX6-NEXT:    s_max_i32 s5, s8, s5
 ; GFX6-NEXT:    s_min_i32 s4, s5, s4
 ; GFX6-NEXT:    s_min_i32 s5, s2, 0
 ; GFX6-NEXT:    s_add_i32 s1, s1, s4
 ; GFX6-NEXT:    s_max_i32 s4, s2, 0
-; GFX6-NEXT:    s_sub_i32 s5, s9, s5
-; GFX6-NEXT:    s_sub_i32 s4, s8, s4
+; GFX6-NEXT:    s_sub_i32 s5, 0x80000000, s5
+; GFX6-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
 ; GFX6-NEXT:    s_max_i32 s5, s5, s6
 ; GFX6-NEXT:    s_min_i32 s4, s5, s4
 ; GFX6-NEXT:    s_min_i32 s5, s3, 0
 ; GFX6-NEXT:    s_add_i32 s2, s2, s4
 ; GFX6-NEXT:    s_max_i32 s4, s3, 0
-; GFX6-NEXT:    s_sub_i32 s5, s9, s5
-; GFX6-NEXT:    s_sub_i32 s4, s8, s4
+; GFX6-NEXT:    s_sub_i32 s5, 0x80000000, s5
+; GFX6-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
 ; GFX6-NEXT:    s_max_i32 s5, s5, s7
 ; GFX6-NEXT:    s_min_i32 s4, s5, s4
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
@@ -1616,33 +1586,31 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ;
 ; GFX8-LABEL: s_saddsat_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_brev_b32 s9, 1
-; GFX8-NEXT:    s_min_i32 s11, s0, 0
-; GFX8-NEXT:    s_brev_b32 s8, -2
-; GFX8-NEXT:    s_max_i32 s10, s0, 0
-; GFX8-NEXT:    s_sub_i32 s11, s9, s11
-; GFX8-NEXT:    s_sub_i32 s10, s8, s10
-; GFX8-NEXT:    s_max_i32 s4, s11, s4
-; GFX8-NEXT:    s_min_i32 s4, s4, s10
-; GFX8-NEXT:    s_min_i32 s10, s1, 0
+; GFX8-NEXT:    s_min_i32 s9, s0, 0
+; GFX8-NEXT:    s_max_i32 s8, s0, 0
+; GFX8-NEXT:    s_sub_i32 s9, 0x80000000, s9
+; GFX8-NEXT:    s_sub_i32 s8, 0x7fffffff, s8
+; GFX8-NEXT:    s_max_i32 s4, s9, s4
+; GFX8-NEXT:    s_min_i32 s4, s4, s8
+; GFX8-NEXT:    s_min_i32 s8, s1, 0
 ; GFX8-NEXT:    s_add_i32 s0, s0, s4
 ; GFX8-NEXT:    s_max_i32 s4, s1, 0
-; GFX8-NEXT:    s_sub_i32 s10, s9, s10
-; GFX8-NEXT:    s_sub_i32 s4, s8, s4
-; GFX8-NEXT:    s_max_i32 s5, s10, s5
+; GFX8-NEXT:    s_sub_i32 s8, 0x80000000, s8
+; GFX8-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
+; GFX8-NEXT:    s_max_i32 s5, s8, s5
 ; GFX8-NEXT:    s_min_i32 s4, s5, s4
 ; GFX8-NEXT:    s_min_i32 s5, s2, 0
 ; GFX8-NEXT:    s_add_i32 s1, s1, s4
 ; GFX8-NEXT:    s_max_i32 s4, s2, 0
-; GFX8-NEXT:    s_sub_i32 s5, s9, s5
-; GFX8-NEXT:    s_sub_i32 s4, s8, s4
+; GFX8-NEXT:    s_sub_i32 s5, 0x80000000, s5
+; GFX8-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
 ; GFX8-NEXT:    s_max_i32 s5, s5, s6
 ; GFX8-NEXT:    s_min_i32 s4, s5, s4
 ; GFX8-NEXT:    s_min_i32 s5, s3, 0
 ; GFX8-NEXT:    s_add_i32 s2, s2, s4
 ; GFX8-NEXT:    s_max_i32 s4, s3, 0
-; GFX8-NEXT:    s_sub_i32 s5, s9, s5
-; GFX8-NEXT:    s_sub_i32 s4, s8, s4
+; GFX8-NEXT:    s_sub_i32 s5, 0x80000000, s5
+; GFX8-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
 ; GFX8-NEXT:    s_max_i32 s5, s5, s7
 ; GFX8-NEXT:    s_min_i32 s4, s5, s4
 ; GFX8-NEXT:    s_add_i32 s3, s3, s4
@@ -1704,21 +1672,20 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
 ; GFX6-NEXT:    v_max_i32_e32 v6, v6, v7
-; GFX6-NEXT:    v_bfrev_b32_e32 v13, 1
 ; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
 ; GFX6-NEXT:    v_bfrev_b32_e32 v11, -2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v13, v6
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v11, v5
 ; GFX6-NEXT:    v_max_i32_e32 v6, v6, v8
 ; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v4
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v13, v6
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v11, v5
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x7fffffff, v5
 ; GFX6-NEXT:    v_max_i32_e32 v6, v6, v9
 ; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
@@ -1748,21 +1715,20 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, s5, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_max_i32_e32 v6, v6, v7
-; GFX8-NEXT:    v_bfrev_b32_e32 v13, 1
 ; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_min_i32_e32 v6, 0, v3
 ; GFX8-NEXT:    v_bfrev_b32_e32 v11, -2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, 0, v3
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v13, v6
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 0x80000000, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v11, v5
 ; GFX8-NEXT:    v_max_i32_e32 v6, v6, v8
 ; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_min_i32_e32 v6, 0, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, 0, v4
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v13, v6
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v11, v5
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 0x80000000, v6
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 0x7fffffff, v5
 ; GFX8-NEXT:    v_max_i32_e32 v6, v6, v9
 ; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
@@ -1795,40 +1761,38 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_v5i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_brev_b32 s11, 1
-; GFX6-NEXT:    s_min_i32 s13, s0, 0
-; GFX6-NEXT:    s_brev_b32 s10, -2
-; GFX6-NEXT:    s_max_i32 s12, s0, 0
-; GFX6-NEXT:    s_sub_i32 s13, s11, s13
-; GFX6-NEXT:    s_sub_i32 s12, s10, s12
-; GFX6-NEXT:    s_max_i32 s5, s13, s5
-; GFX6-NEXT:    s_min_i32 s5, s5, s12
-; GFX6-NEXT:    s_min_i32 s12, s1, 0
+; GFX6-NEXT:    s_min_i32 s11, s0, 0
+; GFX6-NEXT:    s_max_i32 s10, s0, 0
+; GFX6-NEXT:    s_sub_i32 s11, 0x80000000, s11
+; GFX6-NEXT:    s_sub_i32 s10, 0x7fffffff, s10
+; GFX6-NEXT:    s_max_i32 s5, s11, s5
+; GFX6-NEXT:    s_min_i32 s5, s5, s10
+; GFX6-NEXT:    s_min_i32 s10, s1, 0
 ; GFX6-NEXT:    s_add_i32 s0, s0, s5
 ; GFX6-NEXT:    s_max_i32 s5, s1, 0
-; GFX6-NEXT:    s_sub_i32 s12, s11, s12
-; GFX6-NEXT:    s_sub_i32 s5, s10, s5
-; GFX6-NEXT:    s_max_i32 s6, s12, s6
+; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
+; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
+; GFX6-NEXT:    s_max_i32 s6, s10, s6
 ; GFX6-NEXT:    s_min_i32 s5, s6, s5
 ; GFX6-NEXT:    s_min_i32 s6, s2, 0
 ; GFX6-NEXT:    s_add_i32 s1, s1, s5
 ; GFX6-NEXT:    s_max_i32 s5, s2, 0
-; GFX6-NEXT:    s_sub_i32 s6, s11, s6
-; GFX6-NEXT:    s_sub_i32 s5, s10, s5
+; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
+; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
 ; GFX6-NEXT:    s_max_i32 s6, s6, s7
 ; GFX6-NEXT:    s_min_i32 s5, s6, s5
 ; GFX6-NEXT:    s_min_i32 s6, s3, 0
 ; GFX6-NEXT:    s_add_i32 s2, s2, s5
 ; GFX6-NEXT:    s_max_i32 s5, s3, 0
-; GFX6-NEXT:    s_sub_i32 s6, s11, s6
-; GFX6-NEXT:    s_sub_i32 s5, s10, s5
+; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
+; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
 ; GFX6-NEXT:    s_max_i32 s6, s6, s8
 ; GFX6-NEXT:    s_min_i32 s5, s6, s5
 ; GFX6-NEXT:    s_min_i32 s6, s4, 0
 ; GFX6-NEXT:    s_add_i32 s3, s3, s5
 ; GFX6-NEXT:    s_max_i32 s5, s4, 0
-; GFX6-NEXT:    s_sub_i32 s6, s11, s6
-; GFX6-NEXT:    s_sub_i32 s5, s10, s5
+; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
+; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
 ; GFX6-NEXT:    s_max_i32 s6, s6, s9
 ; GFX6-NEXT:    s_min_i32 s5, s6, s5
 ; GFX6-NEXT:    s_add_i32 s4, s4, s5
@@ -1836,40 +1800,38 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ;
 ; GFX8-LABEL: s_saddsat_v5i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_brev_b32 s11, 1
-; GFX8-NEXT:    s_min_i32 s13, s0, 0
-; GFX8-NEXT:    s_brev_b32 s10, -2
-; GFX8-NEXT:    s_max_i32 s12, s0, 0
-; GFX8-NEXT:    s_sub_i32 s13, s11, s13
-; GFX8-NEXT:    s_sub_i32 s12, s10, s12
-; GFX8-NEXT:    s_max_i32 s5, s13, s5
-; GFX8-NEXT:    s_min_i32 s5, s5, s12
-; GFX8-NEXT:    s_min_i32 s12, s1, 0
+; GFX8-NEXT:    s_min_i32 s11, s0, 0
+; GFX8-NEXT:    s_max_i32 s10, s0, 0
+; GFX8-NEXT:    s_sub_i32 s11, 0x80000000, s11
+; GFX8-NEXT:    s_sub_i32 s10, 0x7fffffff, s10
+; GFX8-NEXT:    s_max_i32 s5, s11, s5
+; GFX8-NEXT:    s_min_i32 s5, s5, s10
+; GFX8-NEXT:    s_min_i32 s10, s1, 0
 ; GFX8-NEXT:    s_add_i32 s0, s0, s5
 ; GFX8-NEXT:    s_max_i32 s5, s1, 0
-; GFX8-NEXT:    s_sub_i32 s12, s11, s12
-; GFX8-NEXT:    s_sub_i32 s5, s10, s5
-; GFX8-NEXT:    s_max_i32 s6, s12, s6
+; GFX8-NEXT:    s_sub_i32 s10, 0x80000000, s10
+; GFX8-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
+; GFX8-NEXT:    s_max_i32 s6, s10, s6
 ; GFX8-NEXT:    s_min_i32 s5, s6, s5
 ; GFX8-NEXT:    s_min_i32 s6, s2, 0
 ; GFX8-NEXT:    s_add_i32 s1, s1, s5
 ; GFX8-NEXT:    s_max_i32 s5, s2, 0
-; GFX8-NEXT:    s_sub_i32 s6, s11, s6
-; GFX8-NEXT:    s_sub_i32 s5, s10, s5
+; GFX8-NEXT:    s_sub_i32 s6, 0x80000000, s6
+; GFX8-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
 ; GFX8-NEXT:    s_max_i32 s6, s6, s7
 ; GFX8-NEXT:    s_min_i32 s5, s6, s5
 ; GFX8-NEXT:    s_min_i32 s6, s3, 0
 ; GFX8-NEXT:    s_add_i32 s2, s2, s5
 ; GFX8-NEXT:    s_max_i32 s5, s3, 0
-; GFX8-NEXT:    s_sub_i32 s6, s11, s6
-; GFX8-NEXT:    s_sub_i32 s5, s10, s5
+; GFX8-NEXT:    s_sub_i32 s6, 0x80000000, s6
+; GFX8-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
 ; GFX8-NEXT:    s_max_i32 s6, s6, s8
 ; GFX8-NEXT:    s_min_i32 s5, s6, s5
 ; GFX8-NEXT:    s_min_i32 s6, s4, 0
 ; GFX8-NEXT:    s_add_i32 s3, s3, s5
 ; GFX8-NEXT:    s_max_i32 s5, s4, 0
-; GFX8-NEXT:    s_sub_i32 s6, s11, s6
-; GFX8-NEXT:    s_sub_i32 s5, s10, s5
+; GFX8-NEXT:    s_sub_i32 s6, 0x80000000, s6
+; GFX8-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
 ; GFX8-NEXT:    s_max_i32 s6, s6, s9
 ; GFX8-NEXT:    s_min_i32 s5, s6, s5
 ; GFX8-NEXT:    s_add_i32 s4, s4, s5
@@ -2211,117 +2173,115 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_v16i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_brev_b32 s33, 1
-; GFX6-NEXT:    s_min_i32 s35, s0, 0
-; GFX6-NEXT:    s_brev_b32 s32, -2
-; GFX6-NEXT:    s_max_i32 s34, s0, 0
-; GFX6-NEXT:    s_sub_i32 s35, s33, s35
-; GFX6-NEXT:    s_sub_i32 s34, s32, s34
-; GFX6-NEXT:    s_max_i32 s16, s35, s16
-; GFX6-NEXT:    s_min_i32 s16, s16, s34
-; GFX6-NEXT:    s_min_i32 s34, s1, 0
+; GFX6-NEXT:    s_min_i32 s33, s0, 0
+; GFX6-NEXT:    s_max_i32 s32, s0, 0
+; GFX6-NEXT:    s_sub_i32 s33, 0x80000000, s33
+; GFX6-NEXT:    s_sub_i32 s32, 0x7fffffff, s32
+; GFX6-NEXT:    s_max_i32 s16, s33, s16
+; GFX6-NEXT:    s_min_i32 s16, s16, s32
+; GFX6-NEXT:    s_min_i32 s32, s1, 0
 ; GFX6-NEXT:    s_add_i32 s0, s0, s16
 ; GFX6-NEXT:    s_max_i32 s16, s1, 0
-; GFX6-NEXT:    s_sub_i32 s34, s33, s34
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_max_i32 s17, s34, s17
+; GFX6-NEXT:    s_sub_i32 s32, 0x80000000, s32
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
+; GFX6-NEXT:    s_max_i32 s17, s32, s17
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s2, 0
 ; GFX6-NEXT:    s_add_i32 s1, s1, s16
 ; GFX6-NEXT:    s_max_i32 s16, s2, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s18
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s3, 0
 ; GFX6-NEXT:    s_add_i32 s2, s2, s16
 ; GFX6-NEXT:    s_max_i32 s16, s3, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s19
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s4, 0
 ; GFX6-NEXT:    s_add_i32 s3, s3, s16
 ; GFX6-NEXT:    s_max_i32 s16, s4, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s20
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s5, 0
 ; GFX6-NEXT:    s_add_i32 s4, s4, s16
 ; GFX6-NEXT:    s_max_i32 s16, s5, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s21
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s6, 0
 ; GFX6-NEXT:    s_add_i32 s5, s5, s16
 ; GFX6-NEXT:    s_max_i32 s16, s6, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s22
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s7, 0
 ; GFX6-NEXT:    s_add_i32 s6, s6, s16
 ; GFX6-NEXT:    s_max_i32 s16, s7, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s23
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s8, 0
 ; GFX6-NEXT:    s_add_i32 s7, s7, s16
 ; GFX6-NEXT:    s_max_i32 s16, s8, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s24
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s9, 0
 ; GFX6-NEXT:    s_add_i32 s8, s8, s16
 ; GFX6-NEXT:    s_max_i32 s16, s9, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s25
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s10, 0
 ; GFX6-NEXT:    s_add_i32 s9, s9, s16
 ; GFX6-NEXT:    s_max_i32 s16, s10, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s26
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s11, 0
 ; GFX6-NEXT:    s_add_i32 s10, s10, s16
 ; GFX6-NEXT:    s_max_i32 s16, s11, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s27
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s12, 0
 ; GFX6-NEXT:    s_add_i32 s11, s11, s16
 ; GFX6-NEXT:    s_max_i32 s16, s12, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s28
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s13, 0
 ; GFX6-NEXT:    s_add_i32 s12, s12, s16
 ; GFX6-NEXT:    s_max_i32 s16, s13, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s29
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s14, 0
 ; GFX6-NEXT:    s_add_i32 s13, s13, s16
 ; GFX6-NEXT:    s_max_i32 s16, s14, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s30
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s15, 0
 ; GFX6-NEXT:    s_add_i32 s14, s14, s16
 ; GFX6-NEXT:    s_max_i32 s16, s15, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s31
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s15, s15, s16
@@ -2329,117 +2289,115 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 ;
 ; GFX8-LABEL: s_saddsat_v16i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_brev_b32 s33, 1
-; GFX8-NEXT:    s_min_i32 s35, s0, 0
-; GFX8-NEXT:    s_brev_b32 s32, -2
-; GFX8-NEXT:    s_max_i32 s34, s0, 0
-; GFX8-NEXT:    s_sub_i32 s35, s33, s35
-; GFX8-NEXT:    s_sub_i32 s34, s32, s34
-; GFX8-NEXT:    s_max_i32 s16, s35, s16
-; GFX8-NEXT:    s_min_i32 s16, s16, s34
-; GFX8-NEXT:    s_min_i32 s34, s1, 0
+; GFX8-NEXT:    s_min_i32 s33, s0, 0
+; GFX8-NEXT:    s_max_i32 s32, s0, 0
+; GFX8-NEXT:    s_sub_i32 s33, 0x80000000, s33
+; GFX8-NEXT:    s_sub_i32 s32, 0x7fffffff, s32
+; GFX8-NEXT:    s_max_i32 s16, s33, s16
+; GFX8-NEXT:    s_min_i32 s16, s16, s32
+; GFX8-NEXT:    s_min_i32 s32, s1, 0
 ; GFX8-NEXT:    s_add_i32 s0, s0, s16
 ; GFX8-NEXT:    s_max_i32 s16, s1, 0
-; GFX8-NEXT:    s_sub_i32 s34, s33, s34
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_max_i32 s17, s34, s17
+; GFX8-NEXT:    s_sub_i32 s32, 0x80000000, s32
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
+; GFX8-NEXT:    s_max_i32 s17, s32, s17
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s2, 0
 ; GFX8-NEXT:    s_add_i32 s1, s1, s16
 ; GFX8-NEXT:    s_max_i32 s16, s2, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s18
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s3, 0
 ; GFX8-NEXT:    s_add_i32 s2, s2, s16
 ; GFX8-NEXT:    s_max_i32 s16, s3, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s19
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s4, 0
 ; GFX8-NEXT:    s_add_i32 s3, s3, s16
 ; GFX8-NEXT:    s_max_i32 s16, s4, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s20
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s5, 0
 ; GFX8-NEXT:    s_add_i32 s4, s4, s16
 ; GFX8-NEXT:    s_max_i32 s16, s5, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s21
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s6, 0
 ; GFX8-NEXT:    s_add_i32 s5, s5, s16
 ; GFX8-NEXT:    s_max_i32 s16, s6, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s22
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s7, 0
 ; GFX8-NEXT:    s_add_i32 s6, s6, s16
 ; GFX8-NEXT:    s_max_i32 s16, s7, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s23
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s8, 0
 ; GFX8-NEXT:    s_add_i32 s7, s7, s16
 ; GFX8-NEXT:    s_max_i32 s16, s8, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s24
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s9, 0
 ; GFX8-NEXT:    s_add_i32 s8, s8, s16
 ; GFX8-NEXT:    s_max_i32 s16, s9, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s25
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s10, 0
 ; GFX8-NEXT:    s_add_i32 s9, s9, s16
 ; GFX8-NEXT:    s_max_i32 s16, s10, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s26
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s11, 0
 ; GFX8-NEXT:    s_add_i32 s10, s10, s16
 ; GFX8-NEXT:    s_max_i32 s16, s11, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s27
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s12, 0
 ; GFX8-NEXT:    s_add_i32 s11, s11, s16
 ; GFX8-NEXT:    s_max_i32 s16, s12, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s28
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s13, 0
 ; GFX8-NEXT:    s_add_i32 s12, s12, s16
 ; GFX8-NEXT:    s_max_i32 s16, s13, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s29
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s14, 0
 ; GFX8-NEXT:    s_add_i32 s13, s13, s16
 ; GFX8-NEXT:    s_max_i32 s16, s14, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s30
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s15, 0
 ; GFX8-NEXT:    s_add_i32 s14, s14, s16
 ; GFX8-NEXT:    s_max_i32 s16, s15, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s31
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s15, s15, s16
@@ -2781,60 +2739,55 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX6-LABEL: s_saddsat_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_min_i32 s7, s0, 0
+; GFX6-NEXT:    s_min_i32 s5, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_brev_b32 s4, -2
-; GFX6-NEXT:    s_max_i32 s6, s0, 0
-; GFX6-NEXT:    s_sub_i32 s7, s5, s7
-; GFX6-NEXT:    s_sub_i32 s6, s4, s6
-; GFX6-NEXT:    s_max_i32 s2, s7, s2
-; GFX6-NEXT:    s_min_i32 s2, s2, s6
+; GFX6-NEXT:    s_max_i32 s4, s0, 0
+; GFX6-NEXT:    s_sub_i32 s5, 0x80000000, s5
+; GFX6-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
+; GFX6-NEXT:    s_max_i32 s2, s5, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_min_i32 s2, s2, s4
+; GFX6-NEXT:    s_min_i32 s4, s1, 0
 ; GFX6-NEXT:    s_add_i32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_max_i32 s3, s1, 0
-; GFX6-NEXT:    s_sub_i32 s3, s4, s3
-; GFX6-NEXT:    s_min_i32 s4, s1, 0
-; GFX6-NEXT:    s_sub_i32 s4, s5, s4
+; GFX6-NEXT:    s_sub_i32 s4, 0x80000000, s4
+; GFX6-NEXT:    s_sub_i32 s3, 0x7fffffff, s3
 ; GFX6-NEXT:    s_max_i32 s2, s4, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s3
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX6-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_saddsat_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s6, s0
-; GFX8-NEXT:    s_sext_i32_i16 s7, 0
-; GFX8-NEXT:    s_movk_i32 s5, 0x8000
-; GFX8-NEXT:    s_max_i32 s8, s6, s7
-; GFX8-NEXT:    s_min_i32 s6, s6, s7
-; GFX8-NEXT:    s_sub_i32 s6, s5, s6
+; GFX8-NEXT:    s_sext_i32_i16 s4, s0
+; GFX8-NEXT:    s_sext_i32_i16 s5, 0
+; GFX8-NEXT:    s_max_i32 s6, s4, s5
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
+; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX8-NEXT:    s_sext_i32_i16 s6, s6
+; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sub_i32 s8, s4, s8
-; GFX8-NEXT:    s_max_i32 s1, s6, s1
+; GFX8-NEXT:    s_sub_i32 s6, 0x7fff, s6
+; GFX8-NEXT:    s_max_i32 s1, s4, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sext_i32_i16 s6, s8
+; GFX8-NEXT:    s_sext_i32_i16 s4, s6
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_min_i32 s1, s1, s6
+; GFX8-NEXT:    s_min_i32 s1, s1, s4
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s2
-; GFX8-NEXT:    s_max_i32 s6, s1, s7
-; GFX8-NEXT:    s_min_i32 s1, s1, s7
-; GFX8-NEXT:    s_sub_i32 s1, s5, s1
+; GFX8-NEXT:    s_max_i32 s4, s1, s5
+; GFX8-NEXT:    s_min_i32 s1, s1, s5
+; GFX8-NEXT:    s_sub_i32 s1, 0xffff8000, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sub_i32 s4, s4, s6
+; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
 ; GFX8-NEXT:    s_max_i32 s1, s1, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s4
@@ -2867,22 +2820,20 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: saddsat_v2i16_sv:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_brev_b32 s3, 1
-; GFX6-NEXT:    s_min_i32 s5, s0, 0
+; GFX6-NEXT:    s_min_i32 s3, s0, 0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_brev_b32 s2, -2
-; GFX6-NEXT:    s_max_i32 s4, s0, 0
-; GFX6-NEXT:    s_sub_i32 s5, s3, s5
-; GFX6-NEXT:    s_sub_i32 s4, s2, s4
-; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
-; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
+; GFX6-NEXT:    s_max_i32 s2, s0, 0
+; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
+; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
+; GFX6-NEXT:    v_max_i32_e32 v0, s3, v0
+; GFX6-NEXT:    v_min_i32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
-; GFX6-NEXT:    s_max_i32 s1, s0, 0
-; GFX6-NEXT:    s_sub_i32 s1, s2, s1
 ; GFX6-NEXT:    s_min_i32 s2, s0, 0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_sub_i32 s2, s3, s2
+; GFX6-NEXT:    s_max_i32 s1, s0, 0
+; GFX6-NEXT:    s_sub_i32 s2, 0x80000000, s2
+; GFX6-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
 ; GFX6-NEXT:    v_max_i32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s0, v1
@@ -2897,25 +2848,23 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ;
 ; GFX8-LABEL: saddsat_v2i16_sv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s4, s0
-; GFX8-NEXT:    s_sext_i32_i16 s5, 0
-; GFX8-NEXT:    s_movk_i32 s3, 0x8000
-; GFX8-NEXT:    s_max_i32 s6, s4, s5
-; GFX8-NEXT:    s_min_i32 s4, s4, s5
+; GFX8-NEXT:    s_sext_i32_i16 s2, s0
+; GFX8-NEXT:    s_sext_i32_i16 s3, 0
+; GFX8-NEXT:    s_max_i32 s4, s2, s3
+; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX8-NEXT:    s_sub_i32 s4, s3, s4
-; GFX8-NEXT:    s_sub_i32 s6, s2, s6
-; GFX8-NEXT:    v_max_i16_e32 v1, s4, v0
-; GFX8-NEXT:    s_sext_i32_i16 s4, s1
-; GFX8-NEXT:    v_min_i16_e32 v1, s6, v1
-; GFX8-NEXT:    s_max_i32 s6, s4, s5
-; GFX8-NEXT:    s_min_i32 s4, s4, s5
-; GFX8-NEXT:    s_sub_i32 s3, s3, s4
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    s_sub_i32 s2, s2, s6
+; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
+; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
+; GFX8-NEXT:    v_max_i16_e32 v1, s2, v0
+; GFX8-NEXT:    s_sext_i32_i16 s2, s1
+; GFX8-NEXT:    v_min_i16_e32 v1, s4, v1
+; GFX8-NEXT:    s_max_i32 s4, s2, s3
+; GFX8-NEXT:    s_min_i32 s2, s2, s3
+; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
 ; GFX8-NEXT:    v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v0, s2, v0
+; GFX8-NEXT:    v_min_i16_e32 v0, s4, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_add_u16_e32 v1, s0, v1
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3137,31 +3086,29 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-LABEL: s_saddsat_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_brev_b32 s9, 1
-; GFX6-NEXT:    s_min_i32 s11, s0, 0
+; GFX6-NEXT:    s_min_i32 s9, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX6-NEXT:    s_brev_b32 s8, -2
-; GFX6-NEXT:    s_max_i32 s10, s0, 0
-; GFX6-NEXT:    s_sub_i32 s11, s9, s11
-; GFX6-NEXT:    s_sub_i32 s10, s8, s10
-; GFX6-NEXT:    s_max_i32 s4, s11, s4
+; GFX6-NEXT:    s_max_i32 s8, s0, 0
+; GFX6-NEXT:    s_sub_i32 s9, 0x80000000, s9
+; GFX6-NEXT:    s_sub_i32 s8, 0x7fffffff, s8
+; GFX6-NEXT:    s_max_i32 s4, s9, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX6-NEXT:    s_min_i32 s4, s4, s10
-; GFX6-NEXT:    s_min_i32 s10, s1, 0
+; GFX6-NEXT:    s_min_i32 s4, s4, s8
+; GFX6-NEXT:    s_min_i32 s8, s1, 0
 ; GFX6-NEXT:    s_add_i32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
 ; GFX6-NEXT:    s_max_i32 s5, s1, 0
-; GFX6-NEXT:    s_sub_i32 s10, s9, s10
-; GFX6-NEXT:    s_sub_i32 s5, s8, s5
-; GFX6-NEXT:    s_max_i32 s4, s10, s4
+; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
+; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
+; GFX6-NEXT:    s_max_i32 s4, s8, s4
 ; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_add_i32 s1, s1, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s6, 16
 ; GFX6-NEXT:    s_min_i32 s6, s2, 0
 ; GFX6-NEXT:    s_max_i32 s5, s2, 0
-; GFX6-NEXT:    s_sub_i32 s6, s9, s6
-; GFX6-NEXT:    s_sub_i32 s5, s8, s5
+; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
+; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
 ; GFX6-NEXT:    s_max_i32 s4, s6, s4
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s4, s4, s5
@@ -3169,65 +3116,62 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT:    s_add_i32 s2, s2, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
 ; GFX6-NEXT:    s_max_i32 s5, s3, 0
-; GFX6-NEXT:    s_sub_i32 s6, s9, s6
-; GFX6-NEXT:    s_sub_i32 s5, s8, s5
+; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
+; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
 ; GFX6-NEXT:    s_max_i32 s4, s6, s4
-; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX6-NEXT:    s_add_i32 s3, s3, s4
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
+; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, s4
+; GFX6-NEXT:    s_add_i32 s3, s3, s4
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s4
-; GFX6-NEXT:    s_and_b32 s2, s3, s4
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_saddsat_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s10, s0
-; GFX8-NEXT:    s_sext_i32_i16 s11, 0
-; GFX8-NEXT:    s_movk_i32 s9, 0x8000
-; GFX8-NEXT:    s_max_i32 s12, s10, s11
-; GFX8-NEXT:    s_min_i32 s10, s10, s11
-; GFX8-NEXT:    s_sub_i32 s10, s9, s10
+; GFX8-NEXT:    s_sext_i32_i16 s8, s0
+; GFX8-NEXT:    s_sext_i32_i16 s9, 0
+; GFX8-NEXT:    s_max_i32 s10, s8, s9
+; GFX8-NEXT:    s_min_i32 s8, s8, s9
+; GFX8-NEXT:    s_sub_i32 s8, 0xffff8000, s8
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
-; GFX8-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX8-NEXT:    s_sext_i32_i16 s10, s10
+; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sub_i32 s12, s8, s12
-; GFX8-NEXT:    s_max_i32 s2, s10, s2
+; GFX8-NEXT:    s_sub_i32 s10, 0x7fff, s10
+; GFX8-NEXT:    s_max_i32 s2, s8, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sext_i32_i16 s10, s12
+; GFX8-NEXT:    s_sext_i32_i16 s8, s10
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX8-NEXT:    s_min_i32 s2, s2, s10
+; GFX8-NEXT:    s_min_i32 s2, s2, s8
 ; GFX8-NEXT:    s_add_i32 s0, s0, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s4
-; GFX8-NEXT:    s_max_i32 s10, s2, s11
-; GFX8-NEXT:    s_min_i32 s2, s2, s11
-; GFX8-NEXT:    s_sub_i32 s2, s9, s2
+; GFX8-NEXT:    s_max_i32 s8, s2, s9
+; GFX8-NEXT:    s_min_i32 s2, s2, s9
+; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_sub_i32 s10, s8, s10
+; GFX8-NEXT:    s_sub_i32 s8, 0x7fff, s8
 ; GFX8-NEXT:    s_max_i32 s2, s2, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sext_i32_i16 s6, s10
+; GFX8-NEXT:    s_sext_i32_i16 s6, s8
 ; GFX8-NEXT:    s_min_i32 s2, s2, s6
 ; GFX8-NEXT:    s_add_i32 s4, s4, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s1
-; GFX8-NEXT:    s_max_i32 s6, s2, s11
-; GFX8-NEXT:    s_min_i32 s2, s2, s11
-; GFX8-NEXT:    s_sub_i32 s2, s9, s2
+; GFX8-NEXT:    s_max_i32 s6, s2, s9
+; GFX8-NEXT:    s_min_i32 s2, s2, s9
+; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
 ; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sub_i32 s6, s8, s6
+; GFX8-NEXT:    s_sub_i32 s6, 0x7fff, s6
 ; GFX8-NEXT:    s_max_i32 s2, s2, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s6
@@ -3235,12 +3179,12 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_add_i32 s1, s1, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s5
-; GFX8-NEXT:    s_max_i32 s3, s2, s11
-; GFX8-NEXT:    s_min_i32 s2, s2, s11
-; GFX8-NEXT:    s_sub_i32 s2, s9, s2
+; GFX8-NEXT:    s_max_i32 s3, s2, s9
+; GFX8-NEXT:    s_min_i32 s2, s2, s9
+; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s7
-; GFX8-NEXT:    s_sub_i32 s3, s8, s3
+; GFX8-NEXT:    s_sub_i32 s3, 0x7fff, s3
 ; GFX8-NEXT:    s_max_i32 s2, s2, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
@@ -3454,31 +3398,29 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-LABEL: s_saddsat_v6i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_brev_b32 s13, 1
-; GFX6-NEXT:    s_min_i32 s15, s0, 0
+; GFX6-NEXT:    s_min_i32 s13, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_brev_b32 s12, -2
-; GFX6-NEXT:    s_max_i32 s14, s0, 0
-; GFX6-NEXT:    s_sub_i32 s15, s13, s15
-; GFX6-NEXT:    s_sub_i32 s14, s12, s14
-; GFX6-NEXT:    s_max_i32 s6, s15, s6
+; GFX6-NEXT:    s_max_i32 s12, s0, 0
+; GFX6-NEXT:    s_sub_i32 s13, 0x80000000, s13
+; GFX6-NEXT:    s_sub_i32 s12, 0x7fffffff, s12
+; GFX6-NEXT:    s_max_i32 s6, s13, s6
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX6-NEXT:    s_min_i32 s6, s6, s14
-; GFX6-NEXT:    s_min_i32 s14, s1, 0
+; GFX6-NEXT:    s_min_i32 s6, s6, s12
+; GFX6-NEXT:    s_min_i32 s12, s1, 0
 ; GFX6-NEXT:    s_add_i32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s7, 16
 ; GFX6-NEXT:    s_max_i32 s7, s1, 0
-; GFX6-NEXT:    s_sub_i32 s14, s13, s14
-; GFX6-NEXT:    s_sub_i32 s7, s12, s7
-; GFX6-NEXT:    s_max_i32 s6, s14, s6
+; GFX6-NEXT:    s_sub_i32 s12, 0x80000000, s12
+; GFX6-NEXT:    s_sub_i32 s7, 0x7fffffff, s7
+; GFX6-NEXT:    s_max_i32 s6, s12, s6
 ; GFX6-NEXT:    s_min_i32 s6, s6, s7
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_add_i32 s1, s1, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s8, 16
 ; GFX6-NEXT:    s_min_i32 s8, s2, 0
 ; GFX6-NEXT:    s_max_i32 s7, s2, 0
-; GFX6-NEXT:    s_sub_i32 s8, s13, s8
-; GFX6-NEXT:    s_sub_i32 s7, s12, s7
+; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
+; GFX6-NEXT:    s_sub_i32 s7, 0x7fffffff, s7
 ; GFX6-NEXT:    s_max_i32 s6, s8, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s7
@@ -3486,8 +3428,8 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_add_i32 s2, s2, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
 ; GFX6-NEXT:    s_max_i32 s7, s3, 0
-; GFX6-NEXT:    s_sub_i32 s8, s13, s8
-; GFX6-NEXT:    s_sub_i32 s7, s12, s7
+; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
+; GFX6-NEXT:    s_sub_i32 s7, 0x7fffffff, s7
 ; GFX6-NEXT:    s_max_i32 s6, s8, s6
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s7
@@ -3495,8 +3437,8 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_add_i32 s3, s3, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
 ; GFX6-NEXT:    s_max_i32 s7, s4, 0
-; GFX6-NEXT:    s_sub_i32 s8, s13, s8
-; GFX6-NEXT:    s_sub_i32 s7, s12, s7
+; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
+; GFX6-NEXT:    s_sub_i32 s7, 0x7fffffff, s7
 ; GFX6-NEXT:    s_max_i32 s6, s8, s6
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s7
@@ -3504,71 +3446,68 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_add_i32 s4, s4, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
 ; GFX6-NEXT:    s_max_i32 s7, s5, 0
-; GFX6-NEXT:    s_sub_i32 s8, s13, s8
-; GFX6-NEXT:    s_sub_i32 s7, s12, s7
-; GFX6-NEXT:    s_max_i32 s6, s8, s6
-; GFX6-NEXT:    s_min_i32 s6, s6, s7
+; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX6-NEXT:    s_add_i32 s5, s5, s6
-; GFX6-NEXT:    s_mov_b32 s6, 0xffff
+; GFX6-NEXT:    s_sub_i32 s7, 0x7fffffff, s7
+; GFX6-NEXT:    s_max_i32 s6, s8, s6
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, s6
+; GFX6-NEXT:    s_min_i32 s6, s6, s7
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s0, s0, s6
+; GFX6-NEXT:    s_add_i32 s5, s5, s6
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s6
-; GFX6-NEXT:    s_and_b32 s2, s3, s6
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, s6
+; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s2, s4, s6
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_saddsat_v6i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s14, s0
-; GFX8-NEXT:    s_sext_i32_i16 s15, 0
-; GFX8-NEXT:    s_movk_i32 s13, 0x8000
-; GFX8-NEXT:    s_max_i32 s16, s14, s15
-; GFX8-NEXT:    s_min_i32 s14, s14, s15
-; GFX8-NEXT:    s_sub_i32 s14, s13, s14
+; GFX8-NEXT:    s_sext_i32_i16 s12, s0
+; GFX8-NEXT:    s_sext_i32_i16 s13, 0
+; GFX8-NEXT:    s_max_i32 s14, s12, s13
+; GFX8-NEXT:    s_min_i32 s12, s12, s13
+; GFX8-NEXT:    s_sub_i32 s12, 0xffff8000, s12
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
-; GFX8-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX8-NEXT:    s_sext_i32_i16 s14, s14
+; GFX8-NEXT:    s_sext_i32_i16 s12, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sub_i32 s16, s12, s16
-; GFX8-NEXT:    s_max_i32 s3, s14, s3
+; GFX8-NEXT:    s_sub_i32 s14, 0x7fff, s14
+; GFX8-NEXT:    s_max_i32 s3, s12, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sext_i32_i16 s14, s16
+; GFX8-NEXT:    s_sext_i32_i16 s12, s14
 ; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX8-NEXT:    s_min_i32 s3, s3, s14
+; GFX8-NEXT:    s_min_i32 s3, s3, s12
 ; GFX8-NEXT:    s_add_i32 s0, s0, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s6
-; GFX8-NEXT:    s_max_i32 s14, s3, s15
-; GFX8-NEXT:    s_min_i32 s3, s3, s15
-; GFX8-NEXT:    s_sub_i32 s3, s13, s3
+; GFX8-NEXT:    s_max_i32 s12, s3, s13
+; GFX8-NEXT:    s_min_i32 s3, s3, s13
+; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
-; GFX8-NEXT:    s_sub_i32 s14, s12, s14
+; GFX8-NEXT:    s_sub_i32 s12, 0x7fff, s12
 ; GFX8-NEXT:    s_max_i32 s3, s3, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sext_i32_i16 s9, s14
+; GFX8-NEXT:    s_sext_i32_i16 s9, s12
 ; GFX8-NEXT:    s_min_i32 s3, s3, s9
 ; GFX8-NEXT:    s_add_i32 s6, s6, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s1
-; GFX8-NEXT:    s_max_i32 s9, s3, s15
-; GFX8-NEXT:    s_min_i32 s3, s3, s15
-; GFX8-NEXT:    s_sub_i32 s3, s13, s3
+; GFX8-NEXT:    s_max_i32 s9, s3, s13
+; GFX8-NEXT:    s_min_i32 s3, s3, s13
+; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
 ; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_sub_i32 s9, s12, s9
+; GFX8-NEXT:    s_sub_i32 s9, 0x7fff, s9
 ; GFX8-NEXT:    s_max_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s9
@@ -3576,25 +3515,25 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_add_i32 s1, s1, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s7
-; GFX8-NEXT:    s_max_i32 s4, s3, s15
-; GFX8-NEXT:    s_min_i32 s3, s3, s15
-; GFX8-NEXT:    s_sub_i32 s3, s13, s3
+; GFX8-NEXT:    s_max_i32 s4, s3, s13
+; GFX8-NEXT:    s_min_i32 s3, s3, s13
+; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s10
-; GFX8-NEXT:    s_sub_i32 s4, s12, s4
+; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
 ; GFX8-NEXT:    s_max_i32 s3, s3, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_add_i32 s7, s7, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s2
-; GFX8-NEXT:    s_max_i32 s4, s3, s15
-; GFX8-NEXT:    s_min_i32 s3, s3, s15
-; GFX8-NEXT:    s_sub_i32 s3, s13, s3
+; GFX8-NEXT:    s_max_i32 s4, s3, s13
+; GFX8-NEXT:    s_min_i32 s3, s3, s13
+; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
 ; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_sub_i32 s4, s12, s4
+; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
 ; GFX8-NEXT:    s_max_i32 s3, s3, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
@@ -3602,12 +3541,12 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_add_i32 s2, s2, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s8
-; GFX8-NEXT:    s_max_i32 s4, s3, s15
-; GFX8-NEXT:    s_min_i32 s3, s3, s15
-; GFX8-NEXT:    s_sub_i32 s3, s13, s3
+; GFX8-NEXT:    s_max_i32 s4, s3, s13
+; GFX8-NEXT:    s_min_i32 s3, s3, s13
+; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s11
-; GFX8-NEXT:    s_sub_i32 s4, s12, s4
+; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
 ; GFX8-NEXT:    s_max_i32 s3, s3, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
@@ -3861,31 +3800,29 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-LABEL: s_saddsat_v8i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_brev_b32 s17, 1
-; GFX6-NEXT:    s_min_i32 s19, s0, 0
+; GFX6-NEXT:    s_min_i32 s17, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX6-NEXT:    s_brev_b32 s16, -2
-; GFX6-NEXT:    s_max_i32 s18, s0, 0
-; GFX6-NEXT:    s_sub_i32 s19, s17, s19
-; GFX6-NEXT:    s_sub_i32 s18, s16, s18
-; GFX6-NEXT:    s_max_i32 s8, s19, s8
+; GFX6-NEXT:    s_max_i32 s16, s0, 0
+; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
+; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
+; GFX6-NEXT:    s_max_i32 s8, s17, s8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX6-NEXT:    s_min_i32 s8, s8, s18
-; GFX6-NEXT:    s_min_i32 s18, s1, 0
+; GFX6-NEXT:    s_min_i32 s8, s8, s16
+; GFX6-NEXT:    s_min_i32 s16, s1, 0
 ; GFX6-NEXT:    s_add_i32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s9, 16
 ; GFX6-NEXT:    s_max_i32 s9, s1, 0
-; GFX6-NEXT:    s_sub_i32 s18, s17, s18
-; GFX6-NEXT:    s_sub_i32 s9, s16, s9
-; GFX6-NEXT:    s_max_i32 s8, s18, s8
+; GFX6-NEXT:    s_sub_i32 s16, 0x80000000, s16
+; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
+; GFX6-NEXT:    s_max_i32 s8, s16, s8
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_add_i32 s1, s1, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s10, 16
 ; GFX6-NEXT:    s_min_i32 s10, s2, 0
 ; GFX6-NEXT:    s_max_i32 s9, s2, 0
-; GFX6-NEXT:    s_sub_i32 s10, s17, s10
-; GFX6-NEXT:    s_sub_i32 s9, s16, s9
+; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
+; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
 ; GFX6-NEXT:    s_max_i32 s8, s10, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
@@ -3893,8 +3830,8 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_add_i32 s2, s2, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
 ; GFX6-NEXT:    s_max_i32 s9, s3, 0
-; GFX6-NEXT:    s_sub_i32 s10, s17, s10
-; GFX6-NEXT:    s_sub_i32 s9, s16, s9
+; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
+; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
 ; GFX6-NEXT:    s_max_i32 s8, s10, s8
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
@@ -3902,8 +3839,8 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_add_i32 s3, s3, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
 ; GFX6-NEXT:    s_max_i32 s9, s4, 0
-; GFX6-NEXT:    s_sub_i32 s10, s17, s10
-; GFX6-NEXT:    s_sub_i32 s9, s16, s9
+; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
+; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
 ; GFX6-NEXT:    s_max_i32 s8, s10, s8
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
@@ -3911,8 +3848,8 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_add_i32 s4, s4, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
 ; GFX6-NEXT:    s_max_i32 s9, s5, 0
-; GFX6-NEXT:    s_sub_i32 s10, s17, s10
-; GFX6-NEXT:    s_sub_i32 s9, s16, s9
+; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
+; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
 ; GFX6-NEXT:    s_max_i32 s8, s10, s8
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
@@ -3920,86 +3857,83 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_add_i32 s5, s5, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
 ; GFX6-NEXT:    s_max_i32 s9, s6, 0
-; GFX6-NEXT:    s_sub_i32 s10, s17, s10
-; GFX6-NEXT:    s_sub_i32 s9, s16, s9
+; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
+; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
 ; GFX6-NEXT:    s_max_i32 s8, s10, s8
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
 ; GFX6-NEXT:    s_min_i32 s10, s7, 0
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_add_i32 s6, s6, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
 ; GFX6-NEXT:    s_max_i32 s9, s7, 0
-; GFX6-NEXT:    s_sub_i32 s10, s17, s10
-; GFX6-NEXT:    s_sub_i32 s9, s16, s9
-; GFX6-NEXT:    s_max_i32 s8, s10, s8
-; GFX6-NEXT:    s_min_i32 s8, s8, s9
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX6-NEXT:    s_add_i32 s7, s7, s8
-; GFX6-NEXT:    s_mov_b32 s8, 0xffff
+; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, s8
+; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
+; GFX6-NEXT:    s_max_i32 s8, s10, s8
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s0, s0, s8
+; GFX6-NEXT:    s_min_i32 s8, s8, s9
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
+; GFX6-NEXT:    s_add_i32 s7, s7, s8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s8
-; GFX6-NEXT:    s_and_b32 s2, s3, s8
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, s8
+; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s2, s4, s8
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s4, s7, s8
+; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_and_b32 s3, s6, s8
+; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_saddsat_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s18, s0
-; GFX8-NEXT:    s_sext_i32_i16 s19, 0
-; GFX8-NEXT:    s_movk_i32 s17, 0x8000
-; GFX8-NEXT:    s_max_i32 s20, s18, s19
-; GFX8-NEXT:    s_min_i32 s18, s18, s19
-; GFX8-NEXT:    s_sub_i32 s18, s17, s18
+; GFX8-NEXT:    s_sext_i32_i16 s16, s0
+; GFX8-NEXT:    s_sext_i32_i16 s17, 0
+; GFX8-NEXT:    s_max_i32 s18, s16, s17
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
+; GFX8-NEXT:    s_sub_i32 s16, 0xffff8000, s16
 ; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX8-NEXT:    s_movk_i32 s16, 0x7fff
-; GFX8-NEXT:    s_sext_i32_i16 s18, s18
+; GFX8-NEXT:    s_sext_i32_i16 s16, s16
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_sub_i32 s20, s16, s20
-; GFX8-NEXT:    s_max_i32 s4, s18, s4
+; GFX8-NEXT:    s_sub_i32 s18, 0x7fff, s18
+; GFX8-NEXT:    s_max_i32 s4, s16, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_sext_i32_i16 s18, s20
+; GFX8-NEXT:    s_sext_i32_i16 s16, s18
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX8-NEXT:    s_min_i32 s4, s4, s18
+; GFX8-NEXT:    s_min_i32 s4, s4, s16
 ; GFX8-NEXT:    s_add_i32 s0, s0, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s8
-; GFX8-NEXT:    s_max_i32 s18, s4, s19
-; GFX8-NEXT:    s_min_i32 s4, s4, s19
-; GFX8-NEXT:    s_sub_i32 s4, s17, s4
+; GFX8-NEXT:    s_max_i32 s16, s4, s17
+; GFX8-NEXT:    s_min_i32 s4, s4, s17
+; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
-; GFX8-NEXT:    s_sub_i32 s18, s16, s18
+; GFX8-NEXT:    s_sub_i32 s16, 0x7fff, s16
 ; GFX8-NEXT:    s_max_i32 s4, s4, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_sext_i32_i16 s12, s18
+; GFX8-NEXT:    s_sext_i32_i16 s12, s16
 ; GFX8-NEXT:    s_min_i32 s4, s4, s12
 ; GFX8-NEXT:    s_add_i32 s8, s8, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s1
-; GFX8-NEXT:    s_max_i32 s12, s4, s19
-; GFX8-NEXT:    s_min_i32 s4, s4, s19
-; GFX8-NEXT:    s_sub_i32 s4, s17, s4
+; GFX8-NEXT:    s_max_i32 s12, s4, s17
+; GFX8-NEXT:    s_min_i32 s4, s4, s17
+; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
 ; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_sub_i32 s12, s16, s12
+; GFX8-NEXT:    s_sub_i32 s12, 0x7fff, s12
 ; GFX8-NEXT:    s_max_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s12
@@ -4007,25 +3941,25 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_add_i32 s1, s1, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s9
-; GFX8-NEXT:    s_max_i32 s5, s4, s19
-; GFX8-NEXT:    s_min_i32 s4, s4, s19
-; GFX8-NEXT:    s_sub_i32 s4, s17, s4
+; GFX8-NEXT:    s_max_i32 s5, s4, s17
+; GFX8-NEXT:    s_min_i32 s4, s4, s17
+; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s13
-; GFX8-NEXT:    s_sub_i32 s5, s16, s5
+; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
 ; GFX8-NEXT:    s_max_i32 s4, s4, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_add_i32 s9, s9, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s2
-; GFX8-NEXT:    s_max_i32 s5, s4, s19
-; GFX8-NEXT:    s_min_i32 s4, s4, s19
-; GFX8-NEXT:    s_sub_i32 s4, s17, s4
+; GFX8-NEXT:    s_max_i32 s5, s4, s17
+; GFX8-NEXT:    s_min_i32 s4, s4, s17
+; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
 ; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_sub_i32 s5, s16, s5
+; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
 ; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
@@ -4033,24 +3967,24 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_add_i32 s2, s2, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s10
-; GFX8-NEXT:    s_max_i32 s5, s4, s19
-; GFX8-NEXT:    s_min_i32 s4, s4, s19
-; GFX8-NEXT:    s_sub_i32 s4, s17, s4
+; GFX8-NEXT:    s_max_i32 s5, s4, s17
+; GFX8-NEXT:    s_min_i32 s4, s4, s17
+; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s14
-; GFX8-NEXT:    s_sub_i32 s5, s16, s5
+; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
 ; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_add_i32 s10, s10, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s3
-; GFX8-NEXT:    s_max_i32 s5, s4, s19
-; GFX8-NEXT:    s_min_i32 s4, s4, s19
-; GFX8-NEXT:    s_sub_i32 s4, s17, s4
+; GFX8-NEXT:    s_max_i32 s5, s4, s17
+; GFX8-NEXT:    s_min_i32 s4, s4, s17
+; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s7
-; GFX8-NEXT:    s_sub_i32 s5, s16, s5
+; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
 ; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
@@ -4058,13 +3992,13 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_add_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s11
-; GFX8-NEXT:    s_max_i32 s5, s4, s19
-; GFX8-NEXT:    s_min_i32 s4, s4, s19
+; GFX8-NEXT:    s_max_i32 s5, s4, s17
+; GFX8-NEXT:    s_min_i32 s4, s4, s17
 ; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
-; GFX8-NEXT:    s_sub_i32 s4, s17, s4
+; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s15
-; GFX8-NEXT:    s_sub_i32 s5, s16, s5
+; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
 ; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
@@ -4553,13 +4487,12 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX6-NEXT:    s_mov_b32 s10, 0
+; GFX6-NEXT:    s_mov_b32 s5, 0
 ; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s8
-; GFX6-NEXT:    s_addc_u32 s1, s4, s5
+; GFX6-NEXT:    s_addc_u32 s1, s4, 0x80000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
 ; GFX6-NEXT:    s_add_u32 s0, s2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
@@ -4571,10 +4504,10 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, 0
 ; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX6-NEXT:    s_addc_u32 s3, s4, s5
+; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX6-NEXT:    s_addc_u32 s3, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
@@ -4596,13 +4529,12 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX8-NEXT:    s_mov_b32 s10, 0
+; GFX8-NEXT:    s_mov_b32 s5, 0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s8
-; GFX8-NEXT:    s_addc_u32 s1, s4, s5
+; GFX8-NEXT:    s_addc_u32 s1, s4, 0x80000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
 ; GFX8-NEXT:    s_add_u32 s0, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
@@ -4614,10 +4546,10 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX8-NEXT:    s_mov_b32 s6, 0
+; GFX8-NEXT:    s_mov_b32 s5, 0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX8-NEXT:    s_addc_u32 s3, s4, s5
+; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX8-NEXT:    s_addc_u32 s3, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
@@ -4639,13 +4571,12 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX9-NEXT:    s_mov_b32 s10, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0
 ; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX9-NEXT:    s_brev_b32 s5, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-NEXT:    s_addc_u32 s1, s4, s5
+; GFX9-NEXT:    s_addc_u32 s1, s4, 0x80000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
 ; GFX9-NEXT:    s_add_u32 s0, s2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -4657,10 +4588,10 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX9-NEXT:    s_mov_b32 s6, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0
 ; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX9-NEXT:    s_addc_u32 s3, s4, s5
+; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX9-NEXT:    s_addc_u32 s3, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
@@ -4679,15 +4610,14 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    s_addc_u32 s9, s1, s5
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[4:5], 0
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[8:9], s[0:1]
-; GFX10-NEXT:    s_mov_b32 s11, 0
+; GFX10-NEXT:    s_mov_b32 s10, 0
 ; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-NEXT:    s_brev_b32 s10, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10-NEXT:    s_xor_b32 s8, s4, s1
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s8
-; GFX10-NEXT:    s_addc_u32 s1, s0, s10
+; GFX10-NEXT:    s_addc_u32 s1, s0, 0x80000000
 ; GFX10-NEXT:    s_add_u32 s4, s2, s6
 ; GFX10-NEXT:    s_addc_u32 s5, s3, s7
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s4
@@ -4697,9 +4627,9 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s8
 ; GFX10-NEXT:    s_xor_b32 s2, s3, s2
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s0, s2
-; GFX10-NEXT:    s_addc_u32 s1, s0, s10
+; GFX10-NEXT:    s_addc_u32 s1, s0, 0x80000000
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s1, s2
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
@@ -5453,10 +5383,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_addc_u32 s1, s0, 0
-; GFX6-NEXT:    s_brev_b32 s10, 1
 ; GFX6-NEXT:    s_addc_u32 s2, s0, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_addc_u32 s3, s0, s10
+; GFX6-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s8
@@ -5495,7 +5424,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_addc_u32 s5, s4, 0
 ; GFX6-NEXT:    s_addc_u32 s6, s4, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_addc_u32 s7, s4, s10
+; GFX6-NEXT:    s_addc_u32 s7, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s0
@@ -5550,10 +5479,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_addc_u32 s1, s0, 0
-; GFX8-NEXT:    s_brev_b32 s10, 1
 ; GFX8-NEXT:    s_addc_u32 s2, s0, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_addc_u32 s3, s0, s10
+; GFX8-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s8
@@ -5598,7 +5526,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_addc_u32 s5, s4, 0
 ; GFX8-NEXT:    s_addc_u32 s6, s4, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_addc_u32 s7, s4, s10
+; GFX8-NEXT:    s_addc_u32 s7, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
@@ -5653,10 +5581,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_addc_u32 s1, s0, 0
-; GFX9-NEXT:    s_brev_b32 s10, 1
 ; GFX9-NEXT:    s_addc_u32 s2, s0, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_addc_u32 s3, s0, s10
+; GFX9-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s8
@@ -5701,7 +5628,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_addc_u32 s5, s4, 0
 ; GFX9-NEXT:    s_addc_u32 s6, s4, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_addc_u32 s7, s4, s10
+; GFX9-NEXT:    s_addc_u32 s7, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
@@ -5747,14 +5674,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_and_b32 s1, 1, s1
 ; GFX10-NEXT:    s_ashr_i32 s2, s17, 31
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
-; GFX10-NEXT:    s_brev_b32 s11, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
 ; GFX10-NEXT:    s_mov_b32 s0, 0
 ; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    s_addc_u32 s1, s2, 0
 ; GFX10-NEXT:    s_addc_u32 s10, s2, 0
-; GFX10-NEXT:    s_addc_u32 s3, s2, s11
+; GFX10-NEXT:    s_addc_u32 s3, s2, 0x80000000
 ; GFX10-NEXT:    s_add_u32 s12, s4, s12
 ; GFX10-NEXT:    s_addc_u32 s13, s5, s13
 ; GFX10-NEXT:    s_addc_u32 s18, s6, s14
@@ -5795,7 +5721,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX10-NEXT:    s_addc_u32 s2, s0, 0
-; GFX10-NEXT:    s_addc_u32 s3, s0, s11
+; GFX10-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s0, vcc_lo

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index af008f0cabdef..2599327910eb5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -309,10 +309,10 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) {
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v3, 12, v2
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CHECK-NEXT:    v_subrev_i32_e64 v3, s[4:5], s6, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
+; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
 ; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
@@ -408,15 +408,15 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; CGP-NEXT:    v_subrev_i32_e64 v7, s[4:5], s8, v0
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[4:5]
-; CGP-NEXT:    v_sub_i32_e64 v8, s[6:7], v1, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; CGP-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[4:5]
+; CGP-NEXT:    v_subrev_i32_e32 v7, vcc, s8, v0
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v1, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[6:7]
+; CGP-NEXT:    v_subrev_i32_e32 v8, vcc, 0x1000, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s[4:5]
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
 ; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
@@ -450,10 +450,10 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v3, v2, s6
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CHECK-NEXT:    v_subrev_i32_e64 v3, s[4:5], s6, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
+; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
 ; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
@@ -549,15 +549,15 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; CGP-NEXT:    v_subrev_i32_e64 v7, s[4:5], s8, v0
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[4:5]
-; CGP-NEXT:    v_sub_i32_e64 v8, s[6:7], v1, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; CGP-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[4:5]
+; CGP-NEXT:    v_subrev_i32_e32 v7, vcc, s8, v0
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v1, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[6:7]
+; CGP-NEXT:    v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s[4:5]
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
 ; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index d4378da215ee5..f92b02bd7b6e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -1186,9 +1186,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_sdiv_v2i64_pow2k_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_movk_i32 s10, 0x1000
 ; GISEL-NEXT:    s_mov_b32 s6, 0
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
+; GISEL-NEXT:    s_add_u32 s4, 0x1000, 0
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
@@ -1317,7 +1316,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v8
 ; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v9, vcc
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
+; GISEL-NEXT:    s_add_u32 s4, 0x1000, 0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
@@ -1890,9 +1889,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_sdiv_v2i64_oddk_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s10, 0x12d8fb
 ; GISEL-NEXT:    s_mov_b32 s6, 0
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
+; GISEL-NEXT:    s_add_u32 s4, 0x12d8fb, 0
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
@@ -2021,7 +2019,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v8
 ; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v9, vcc
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
+; GISEL-NEXT:    s_add_u32 s4, 0x12d8fb, 0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 5d773c3d9c5ea..3120da6cfc4c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -1157,121 +1157,120 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX10-LABEL: sdivrem_v4i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0x4f7ffffe
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_ashr_i32 s0, s12, 31
-; GFX10-NEXT:    s_ashr_i32 s2, s14, 31
-; GFX10-NEXT:    s_add_i32 s6, s12, s0
-; GFX10-NEXT:    s_add_i32 s12, s14, s2
-; GFX10-NEXT:    s_xor_b32 s14, s6, s0
 ; GFX10-NEXT:    s_ashr_i32 s1, s13, 31
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s14
+; GFX10-NEXT:    s_ashr_i32 s2, s14, 31
 ; GFX10-NEXT:    s_ashr_i32 s3, s15, 31
+; GFX10-NEXT:    s_add_i32 s6, s12, s0
 ; GFX10-NEXT:    s_add_i32 s7, s13, s1
+; GFX10-NEXT:    s_add_i32 s12, s14, s2
 ; GFX10-NEXT:    s_add_i32 s13, s15, s3
+; GFX10-NEXT:    s_xor_b32 s14, s6, s0
 ; GFX10-NEXT:    s_xor_b32 s15, s7, s1
 ; GFX10-NEXT:    s_xor_b32 s12, s12, s2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s14
+; GFX10-NEXT:    s_xor_b32 s13, s13, s3
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s15
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s12
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s13
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT:    s_xor_b32 s13, s13, s3
 ; GFX10-NEXT:    s_sub_i32 s6, 0, s14
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s13
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX10-NEXT:    s_sub_i32 s7, 0, s15
 ; GFX10-NEXT:    s_sub_i32 s19, 0, s12
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX10-NEXT:    s_ashr_i32 s16, s8, 31
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX10-NEXT:    s_ashr_i32 s18, s10, 31
 ; GFX10-NEXT:    s_ashr_i32 s17, s9, 31
+; GFX10-NEXT:    s_ashr_i32 s18, s10, 31
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    s_xor_b32 s20, s16, s0
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v4
-; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    s_xor_b32 s21, s17, s1
-; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v4
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v4, s6, v0
 ; GFX10-NEXT:    s_sub_i32 s6, 0, s13
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v5, s7, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v6, s19, v2
+; GFX10-NEXT:    v_mul_lo_u32 v7, s6, v3
 ; GFX10-NEXT:    s_ashr_i32 s19, s11, 31
+; GFX10-NEXT:    s_add_i32 s6, s8, s16
 ; GFX10-NEXT:    s_add_i32 s7, s9, s17
-; GFX10-NEXT:    v_mul_lo_u32 v7, s6, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v4, v0, v4
-; GFX10-NEXT:    s_add_i32 s6, s8, s16
 ; GFX10-NEXT:    s_add_i32 s8, s10, s18
 ; GFX10-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v2, v6
-; GFX10-NEXT:    s_xor_b32 s10, s6, s16
-; GFX10-NEXT:    s_add_i32 s9, s11, s19
 ; GFX10-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v4
+; GFX10-NEXT:    s_add_i32 s9, s11, s19
+; GFX10-NEXT:    s_xor_b32 s10, s6, s16
 ; GFX10-NEXT:    s_xor_b32 s11, s7, s17
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v4
 ; GFX10-NEXT:    s_xor_b32 s8, s8, s18
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v6
-; GFX10-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GFX10-NEXT:    s_xor_b32 s9, s9, s19
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v7
+; GFX10-NEXT:    s_xor_b32 s9, s9, s19
+; GFX10-NEXT:    v_mul_hi_u32 v0, s10, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s11, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v2, s8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v3, s9, v3
 ; GFX10-NEXT:    s_xor_b32 s22, s18, s2
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX10-NEXT:    v_mul_hi_u32 v3, s9, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v4, v0, s14
-; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v5, v1, s15
 ; GFX10-NEXT:    v_mul_lo_u32 v6, v2, s12
+; GFX10-NEXT:    v_mul_lo_u32 v7, v3, s13
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v2
-; GFX10-NEXT:    v_mul_lo_u32 v7, v3, s13
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s10, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s10, v4
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v5, s11, v5
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s8, v6
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v4
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v7, s9, v7
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v4
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s15, v5
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s12, v6
+; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s13, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, s14, v4
-; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s13, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s15, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s12, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v11, s13, v7
-; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s1
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v3
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v4
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s15, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, s14, v4
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s12, v6
-; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v3
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s13, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, s14, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s15, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s12, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, s13, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v11, s13, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s1
 ; GFX10-NEXT:    s_xor_b32 s0, s19, s3
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s2
 ; GFX10-NEXT:    v_xor_b32_e32 v0, s20, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s21, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v2, s22, v2
@@ -2817,7 +2816,6 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-LABEL: sdivrem_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
-; GFX8-NEXT:    s_mov_b32 s10, 0x100010
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s3
 ; GFX8-NEXT:    s_ashr_i32 s8, s0, 31
@@ -2825,41 +2823,41 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-NEXT:    s_xor_b32 s9, s0, s8
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GFX8-NEXT:    s_sub_i32 s6, 0, s9
-; GFX8-NEXT:    s_sext_i32_i16 s0, s2
-; GFX8-NEXT:    s_bfe_i32 s1, s3, s10
+; GFX8-NEXT:    s_bfe_i32 s1, s3, 0x100010
+; GFX8-NEXT:    s_ashr_i32 s10, s1, 31
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_ashr_i32 s3, s0, 31
-; GFX8-NEXT:    s_ashr_i32 s11, s1, 31
-; GFX8-NEXT:    s_add_i32 s0, s0, s3
+; GFX8-NEXT:    s_add_i32 s1, s1, s10
+; GFX8-NEXT:    s_xor_b32 s11, s1, s10
+; GFX8-NEXT:    s_sext_i32_i16 s0, s2
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_add_i32 s1, s1, s11
-; GFX8-NEXT:    s_xor_b32 s0, s0, s3
-; GFX8-NEXT:    s_xor_b32 s12, s1, s11
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s11
+; GFX8-NEXT:    s_ashr_i32 s3, s0, 31
+; GFX8-NEXT:    s_add_i32 s0, s0, s3
 ; GFX8-NEXT:    v_mul_lo_u32 v1, s6, v0
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s12
+; GFX8-NEXT:    s_xor_b32 s0, s0, s3
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s9
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v2
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s9, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s9, v2
-; GFX8-NEXT:    s_sub_i32 s1, 0, s12
+; GFX8-NEXT:    s_sub_i32 s1, 0, s11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v1
-; GFX8-NEXT:    s_bfe_i32 s1, s2, s10
+; GFX8-NEXT:    s_bfe_i32 s1, s2, 0x100010
 ; GFX8-NEXT:    s_ashr_i32 s2, s1, 31
 ; GFX8-NEXT:    s_add_i32 s1, s1, s2
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
@@ -2870,19 +2868,19 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s3, v2
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s12
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s11
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v3
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s12, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s12, v3
-; GFX8-NEXT:    s_xor_b32 s0, s2, s11
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
+; GFX8-NEXT:    s_xor_b32 s0, s2, s10
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
@@ -2907,45 +2905,44 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX9-LABEL: sdivrem_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX9-NEXT:    s_mov_b32 s10, 0x100010
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_sext_i32_i16 s0, s7
 ; GFX9-NEXT:    s_ashr_i32 s8, s0, 31
 ; GFX9-NEXT:    s_add_i32 s0, s0, s8
 ; GFX9-NEXT:    s_xor_b32 s9, s0, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX9-NEXT:    s_bfe_i32 s1, s7, s10
-; GFX9-NEXT:    s_ashr_i32 s7, s1, 31
-; GFX9-NEXT:    s_add_i32 s1, s1, s7
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    s_bfe_i32 s5, s7, 0x100010
+; GFX9-NEXT:    s_ashr_i32 s7, s5, 31
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s11, s1, s7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX9-NEXT:    s_sub_i32 s1, 0, s9
+; GFX9-NEXT:    s_add_i32 s5, s5, s7
+; GFX9-NEXT:    s_xor_b32 s5, s5, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_sub_i32 s10, 0, s9
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_sext_i32_i16 s0, s6
-; GFX9-NEXT:    s_ashr_i32 s12, s0, 31
-; GFX9-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GFX9-NEXT:    s_sext_i32_i16 s4, s6
+; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v0
+; GFX9-NEXT:    s_ashr_i32 s10, s4, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_add_i32 s0, s0, s12
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    s_xor_b32 s13, s0, s12
-; GFX9-NEXT:    s_sub_i32 s0, 0, s11
+; GFX9-NEXT:    s_add_i32 s4, s4, s10
+; GFX9-NEXT:    s_xor_b32 s4, s4, s10
+; GFX9-NEXT:    s_sub_i32 s11, 0, s5
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s13, v0
-; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    s_bfe_i32 s4, s6, s10
+; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, s11, v1
+; GFX9-NEXT:    s_bfe_i32 s6, s6, 0x100010
+; GFX9-NEXT:    s_ashr_i32 s11, s6, 31
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s9
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX9-NEXT:    s_ashr_i32 s5, s4, 31
-; GFX9-NEXT:    s_add_i32 s4, s4, s5
-; GFX9-NEXT:    v_sub_u32_e32 v3, s13, v3
+; GFX9-NEXT:    s_add_i32 s6, s6, s11
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
+; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
-; GFX9-NEXT:    s_xor_b32 s4, s4, s5
+; GFX9-NEXT:    s_xor_b32 s4, s6, s11
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s9, v3
@@ -2956,28 +2953,28 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s9, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s11
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s5
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    s_xor_b32 s6, s12, s8
+; GFX9-NEXT:    s_xor_b32 s6, s10, s8
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s11, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s11, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    s_xor_b32 s4, s5, s7
-; GFX9-NEXT:    v_xor_b32_e32 v2, s12, v2
+; GFX9-NEXT:    s_xor_b32 s4, s11, s7
+; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
+; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s12, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
 ; GFX9-NEXT:    v_sub_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_sub_u32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_sub_u32_sdwa v3, v3, s11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v1
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v4, v3
@@ -2990,19 +2987,18 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX10-LABEL: sdivrem_v2i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
-; GFX10-NEXT:    s_mov_b32 s2, 0x100010
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i16 s3, s1
-; GFX10-NEXT:    s_bfe_i32 s1, s1, s2
-; GFX10-NEXT:    s_ashr_i32 s8, s3, 31
-; GFX10-NEXT:    s_ashr_i32 s9, s1, 31
-; GFX10-NEXT:    s_add_i32 s3, s3, s8
-; GFX10-NEXT:    s_add_i32 s1, s1, s9
-; GFX10-NEXT:    s_xor_b32 s3, s3, s8
-; GFX10-NEXT:    s_xor_b32 s1, s1, s9
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX10-NEXT:    s_sext_i32_i16 s2, s1
+; GFX10-NEXT:    s_bfe_i32 s1, s1, 0x100010
+; GFX10-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX10-NEXT:    s_ashr_i32 s8, s1, 31
+; GFX10-NEXT:    s_add_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s1, s1, s8
+; GFX10-NEXT:    s_xor_b32 s2, s2, s3
+; GFX10-NEXT:    s_xor_b32 s1, s1, s8
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s1
-; GFX10-NEXT:    s_sub_i32 s6, 0, s3
+; GFX10-NEXT:    s_sub_i32 s6, 0, s2
 ; GFX10-NEXT:    s_sub_i32 s7, 0, s1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
@@ -3013,29 +3009,29 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s6, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s7, v1
 ; GFX10-NEXT:    s_sext_i32_i16 s6, s0
-; GFX10-NEXT:    s_bfe_i32 s0, s0, s2
-; GFX10-NEXT:    s_ashr_i32 s2, s6, 31
+; GFX10-NEXT:    s_bfe_i32 s0, s0, 0x100010
+; GFX10-NEXT:    s_ashr_i32 s9, s6, 31
 ; GFX10-NEXT:    s_ashr_i32 s10, s0, 31
-; GFX10-NEXT:    s_add_i32 s6, s6, s2
+; GFX10-NEXT:    s_add_i32 s6, s6, s9
 ; GFX10-NEXT:    s_add_i32 s0, s0, s10
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX10-NEXT:    s_xor_b32 s6, s6, s2
+; GFX10-NEXT:    s_xor_b32 s6, s6, s9
 ; GFX10-NEXT:    s_xor_b32 s0, s0, s10
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s0, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s3
+; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s6, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s3, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
@@ -3043,28 +3039,27 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s3, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s1, v3
-; GFX10-NEXT:    s_xor_b32 s1, s2, s8
+; GFX10-NEXT:    s_xor_b32 s1, s9, s3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    s_xor_b32 s0, s10, s9
+; GFX10-NEXT:    s_xor_b32 s0, s10, s8
 ; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX10-NEXT:    v_xor_b32_e32 v2, s2, v2
+; GFX10-NEXT:    v_xor_b32_e32 v2, s9, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v3, s10, v3
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_sub_nc_u32_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s2, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s9, v2
 ; GFX10-NEXT:    v_sub_nc_u32_sdwa v3, v3, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v4, v1
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v4, v3
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v2, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    global_store_dword v1, v2, s[6:7]
@@ -3366,9 +3361,8 @@ define amdgpu_kernel void @sdivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s7, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s4, v0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s7, v1
-; GFX10-NEXT:    s_mov_b32 s4, 0x7ffffff
-; GFX10-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7ffffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0x7ffffff, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-NEXT:    global_store_dword v2, v1, s[2:3]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 7e5ecaac8d9c7..97a09585b73e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -439,9 +439,8 @@ define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-NEXT:    s_brev_b32 s4, -4
-; GFX10-NEXT:    v_and_b32_e32 v2, s4, v0
-; GFX10-NEXT:    v_and_b32_e32 v4, s4, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0x3fffffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0x3fffffff, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[2:3]
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 2, v[4:5]
@@ -523,9 +522,8 @@ define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_brev_b32 s4, -8
-; GFX10-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX10-NEXT:    v_and_b32_e32 v2, s4, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0x1fffffff, v1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
@@ -608,13 +606,12 @@ define i32 @v_shl_i32_zext_i16(i16 %x) {
 define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
 ; GFX7-LABEL: s_shl_v2i32_zext_v2i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX7-NEXT:    s_and_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_or_b32 s0, s1, s0
 ; GFX7-NEXT:    s_and_b32 s0, s0, 0x3fff3fff
 ; GFX7-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX7-NEXT:    s_and_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX7-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index acf525adeeff7..15173a7d5cea0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -785,25 +785,23 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
 define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) {
 ; GFX6-LABEL: s_shl_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s3
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX6-NEXT:    s_and_b32 s1, s1, s4
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_shl_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s3
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
-; GFX8-NEXT:    s_lshl_b32 s1, s2, s4
+; GFX8-NEXT:    s_lshl_b32 s1, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s3
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -869,10 +867,10 @@ define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
 define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
 ; GFX6-LABEL: shl_v2i16_vs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
-; GFX6-NEXT:    s_and_b32 s0, s1, s2
+; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
+; GFX6-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
@@ -970,39 +968,37 @@ define <2 x float> @v_shl_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) {
 ; GFX6-LABEL: s_shl_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s5
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX6-NEXT:    s_and_b32 s1, s1, s8
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, s7
-; GFX6-NEXT:    s_and_b32 s0, s0, s8
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s8
-; GFX6-NEXT:    s_and_b32 s2, s3, s8
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_shl_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s6, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s6
-; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, s6
-; GFX8-NEXT:    s_lshr_b32 s8, s3, 16
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX8-NEXT:    s_lshl_b32 s2, s4, s7
+; GFX8-NEXT:    s_lshl_b32 s2, s4, s6
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s3
-; GFX8-NEXT:    s_lshl_b32 s3, s5, s8
+; GFX8-NEXT:    s_lshl_b32 s3, s5, s7
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s6
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s2, s0
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, s6
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1144,67 +1140,65 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) {
 ; GFX6-LABEL: s_shl_v8i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s16, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s9
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s8
-; GFX6-NEXT:    s_and_b32 s1, s1, s16
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, s10
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, s11
-; GFX6-NEXT:    s_and_b32 s0, s0, s16
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, s13
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s16
-; GFX6-NEXT:    s_and_b32 s2, s3, s16
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, s12
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, s15
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, s16
+; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, s14
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s2, s4, s16
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s4, s7, s16
+; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_and_b32 s3, s6, s16
+; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_shl_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s12, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s12
-; GFX8-NEXT:    s_lshr_b32 s13, s4, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
 ; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, s12
-; GFX8-NEXT:    s_lshr_b32 s14, s5, 16
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX8-NEXT:    s_lshl_b32 s4, s8, s13
+; GFX8-NEXT:    s_lshl_b32 s4, s8, s12
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, s12
-; GFX8-NEXT:    s_lshr_b32 s15, s6, 16
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s5
-; GFX8-NEXT:    s_lshl_b32 s5, s9, s14
+; GFX8-NEXT:    s_lshl_b32 s5, s9, s13
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s12
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, s12
-; GFX8-NEXT:    s_lshr_b32 s16, s7, 16
+; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, s6
-; GFX8-NEXT:    s_lshl_b32 s6, s10, s15
+; GFX8-NEXT:    s_lshl_b32 s6, s10, s14
 ; GFX8-NEXT:    s_or_b32 s0, s4, s0
 ; GFX8-NEXT:    s_lshl_b32 s4, s5, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, s12
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, s7
-; GFX8-NEXT:    s_lshl_b32 s7, s11, s16
+; GFX8-NEXT:    s_lshl_b32 s7, s11, s15
 ; GFX8-NEXT:    s_or_b32 s1, s4, s1
 ; GFX8-NEXT:    s_lshl_b32 s4, s6, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, s12
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX8-NEXT:    s_or_b32 s2, s4, s2
 ; GFX8-NEXT:    s_lshl_b32 s4, s7, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, s12
+; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX8-NEXT:    s_or_b32 s3, s4, s3
 ; GFX8-NEXT:    ; return to shader part epilog
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
index a85173ac78df8..f4eb1437cce30 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
@@ -377,13 +377,13 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v1, v5
+; CGP-NEXT:    v_subrev_i32_e32 v4, vcc, 0x1000, v1
 ; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v1, v5
+; CGP-NEXT:    v_subrev_i32_e32 v4, vcc, 0x1000, v1
 ; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
@@ -508,13 +508,13 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v1, v5
+; CGP-NEXT:    v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
 ; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v1, v5
+; CGP-NEXT:    v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
 ; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 0cddf3e2c86ef..33d88ad985283 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -1164,9 +1164,8 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_srem_v2i64_pow2k_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_movk_i32 s10, 0x1000
 ; GISEL-NEXT:    s_mov_b32 s6, 0
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
+; GISEL-NEXT:    s_add_u32 s4, 0x1000, 0
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
@@ -1294,7 +1293,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_subrev_i32_e32 v9, vcc, s8, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
+; GISEL-NEXT:    s_add_u32 s4, 0x1000, 0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
@@ -1860,9 +1859,8 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_srem_v2i64_oddk_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s10, 0x12d8fb
 ; GISEL-NEXT:    s_mov_b32 s6, 0
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
+; GISEL-NEXT:    s_add_u32 s4, 0x12d8fb, 0
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
@@ -1990,7 +1988,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_subrev_i32_e32 v9, vcc, s8, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
+; GISEL-NEXT:    s_add_u32 s4, 0x12d8fb, 0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 38599949d7777..95e1c42572dd3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -320,12 +320,11 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v4
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
@@ -345,31 +344,28 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
-; GFX6-NEXT:    s_brev_b32 s4, -2
-; GFX6-NEXT:    s_max_i32 s6, s0, -1
+; GFX6-NEXT:    s_max_i32 s4, s0, -1
 ; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_sub_i32 s6, s6, s4
-; GFX6-NEXT:    s_min_i32 s7, s0, -1
-; GFX6-NEXT:    s_sub_i32 s7, s7, s5
-; GFX6-NEXT:    s_max_i32 s1, s6, s1
-; GFX6-NEXT:    s_min_i32 s1, s1, s7
+; GFX6-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s5, s0, -1
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x80000000
+; GFX6-NEXT:    s_max_i32 s1, s4, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s5
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
 ; GFX6-NEXT:    s_max_i32 s3, s1, -1
-; GFX6-NEXT:    s_sub_i32 s3, s3, s4
+; GFX6-NEXT:    s_sub_i32 s3, s3, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s4, s1, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, s5
+; GFX6-NEXT:    s_sub_i32 s4, s4, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s2, s3, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s4
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
-; GFX6-NEXT:    s_movk_i32 s2, 0xff
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
-; GFX6-NEXT:    s_and_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -379,32 +375,30 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_bfe_u32 s4, 8, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX8-NEXT:    s_sext_i32_i16 s7, s0
-; GFX8-NEXT:    s_sext_i32_i16 s8, -1
-; GFX8-NEXT:    s_movk_i32 s5, 0x7fff
-; GFX8-NEXT:    s_max_i32 s9, s7, s8
+; GFX8-NEXT:    s_sext_i32_i16 s5, s0
+; GFX8-NEXT:    s_sext_i32_i16 s6, -1
+; GFX8-NEXT:    s_max_i32 s7, s5, s6
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX8-NEXT:    s_sub_i32 s9, s9, s5
-; GFX8-NEXT:    s_movk_i32 s6, 0x8000
-; GFX8-NEXT:    s_min_i32 s7, s7, s8
-; GFX8-NEXT:    s_sext_i32_i16 s9, s9
+; GFX8-NEXT:    s_sub_i32 s7, s7, 0x7fff
+; GFX8-NEXT:    s_min_i32 s5, s5, s6
+; GFX8-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sub_i32 s7, s7, s6
-; GFX8-NEXT:    s_max_i32 s1, s9, s1
+; GFX8-NEXT:    s_sub_i32 s5, s5, 0xffff8000
+; GFX8-NEXT:    s_max_i32 s1, s7, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sext_i32_i16 s7, s7
-; GFX8-NEXT:    s_min_i32 s1, s1, s7
+; GFX8-NEXT:    s_sext_i32_i16 s5, s5
+; GFX8-NEXT:    s_min_i32 s1, s1, s5
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s1
-; GFX8-NEXT:    s_max_i32 s7, s3, s8
-; GFX8-NEXT:    s_sub_i32 s5, s7, s5
-; GFX8-NEXT:    s_min_i32 s3, s3, s8
+; GFX8-NEXT:    s_max_i32 s5, s3, s6
+; GFX8-NEXT:    s_sub_i32 s5, s5, 0x7fff
+; GFX8-NEXT:    s_min_i32 s3, s3, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sub_i32 s3, s3, s6
+; GFX8-NEXT:    s_sub_i32 s3, s3, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s2, s5, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
@@ -413,10 +407,9 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s4
-; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s4
-; GFX8-NEXT:    s_and_b32 s1, s1, s2
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -424,17 +417,16 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-LABEL: s_ssubsat_v2i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX9-NEXT:    s_mov_b32 s2, 0x80008
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
-; GFX9-NEXT:    s_lshl_b32 s2, s3, 8
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
@@ -451,15 +443,14 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT:    s_mov_b32 s2, 0x80008
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
-; GFX10-NEXT:    s_lshl_b32 s2, s4, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
 ; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1 clamp
 ; GFX10-NEXT:    s_movk_i32 s0, 0xff
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
@@ -639,35 +630,33 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v7
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_and_or_b32 v3, v6, v2, v4
-; GFX10-NEXT:    v_and_or_b32 v2, v8, v2, v5
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v6
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v5, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v7, v4
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 24
-; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
-; GFX10-NEXT:    v_pk_sub_i16 v1, v3, v2 clamp
+; GFX10-NEXT:    v_pk_sub_i16 v1, v2, v3 clamp
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_b32_e32 v3, s4, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v2
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -685,59 +674,56 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX6-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
-; GFX6-NEXT:    s_brev_b32 s8, -2
-; GFX6-NEXT:    s_max_i32 s10, s0, -1
+; GFX6-NEXT:    s_max_i32 s8, s0, -1
 ; GFX6-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX6-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_brev_b32 s9, 1
-; GFX6-NEXT:    s_sub_i32 s10, s10, s8
-; GFX6-NEXT:    s_min_i32 s11, s0, -1
-; GFX6-NEXT:    s_sub_i32 s11, s11, s9
-; GFX6-NEXT:    s_max_i32 s1, s10, s1
-; GFX6-NEXT:    s_min_i32 s1, s1, s11
+; GFX6-NEXT:    s_sub_i32 s8, s8, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s9, s0, -1
+; GFX6-NEXT:    s_sub_i32 s9, s9, 0x80000000
+; GFX6-NEXT:    s_max_i32 s1, s8, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s9
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
 ; GFX6-NEXT:    s_max_i32 s5, s1, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, s8
-; GFX6-NEXT:    s_min_i32 s10, s1, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, s9
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s8, s1, -1
+; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s2, s5, s2
-; GFX6-NEXT:    s_min_i32 s2, s2, s10
+; GFX6-NEXT:    s_min_i32 s2, s2, s8
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
 ; GFX6-NEXT:    s_max_i32 s5, s2, -1
 ; GFX6-NEXT:    s_lshl_b32 s3, s6, 24
-; GFX6-NEXT:    s_sub_i32 s5, s5, s8
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s6, s2, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, s9
+; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s3, s5, s3
 ; GFX6-NEXT:    s_min_i32 s3, s3, s6
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
 ; GFX6-NEXT:    s_max_i32 s5, s3, -1
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
-; GFX6-NEXT:    s_sub_i32 s5, s5, s8
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s6, s3, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, s9
-; GFX6-NEXT:    s_max_i32 s4, s5, s4
-; GFX6-NEXT:    s_min_i32 s4, s4, s6
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
-; GFX6-NEXT:    s_sub_i32 s3, s3, s4
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
-; GFX6-NEXT:    s_and_b32 s1, s1, s4
+; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
+; GFX6-NEXT:    s_max_i32 s4, s5, s4
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 24
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
+; GFX6-NEXT:    s_min_i32 s4, s4, s6
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX6-NEXT:    s_sub_i32 s3, s3, s4
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s4
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s3, s4
+; GFX6-NEXT:    s_and_b32 s1, s3, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -749,35 +735,33 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s8
-; GFX8-NEXT:    s_sext_i32_i16 s11, s0
-; GFX8-NEXT:    s_sext_i32_i16 s12, -1
-; GFX8-NEXT:    s_movk_i32 s9, 0x7fff
-; GFX8-NEXT:    s_max_i32 s13, s11, s12
+; GFX8-NEXT:    s_sext_i32_i16 s9, s0
+; GFX8-NEXT:    s_sext_i32_i16 s10, -1
+; GFX8-NEXT:    s_max_i32 s11, s9, s10
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s8
-; GFX8-NEXT:    s_sub_i32 s13, s13, s9
-; GFX8-NEXT:    s_movk_i32 s10, 0x8000
-; GFX8-NEXT:    s_min_i32 s11, s11, s12
-; GFX8-NEXT:    s_sext_i32_i16 s13, s13
+; GFX8-NEXT:    s_sub_i32 s11, s11, 0x7fff
+; GFX8-NEXT:    s_min_i32 s9, s9, s10
+; GFX8-NEXT:    s_sext_i32_i16 s11, s11
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sub_i32 s11, s11, s10
-; GFX8-NEXT:    s_max_i32 s1, s13, s1
+; GFX8-NEXT:    s_sub_i32 s9, s9, 0xffff8000
+; GFX8-NEXT:    s_max_i32 s1, s11, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sext_i32_i16 s11, s11
-; GFX8-NEXT:    s_min_i32 s1, s1, s11
+; GFX8-NEXT:    s_sext_i32_i16 s9, s9
+; GFX8-NEXT:    s_min_i32 s1, s1, s9
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, s8
 ; GFX8-NEXT:    s_lshl_b32 s2, s5, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s1
-; GFX8-NEXT:    s_max_i32 s11, s5, s12
-; GFX8-NEXT:    s_sub_i32 s11, s11, s9
-; GFX8-NEXT:    s_min_i32 s5, s5, s12
-; GFX8-NEXT:    s_sext_i32_i16 s11, s11
+; GFX8-NEXT:    s_max_i32 s9, s5, s10
+; GFX8-NEXT:    s_sub_i32 s9, s9, 0x7fff
+; GFX8-NEXT:    s_min_i32 s5, s5, s10
+; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sub_i32 s5, s5, s10
-; GFX8-NEXT:    s_max_i32 s2, s11, s2
+; GFX8-NEXT:    s_sub_i32 s5, s5, 0xffff8000
+; GFX8-NEXT:    s_max_i32 s2, s9, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_min_i32 s2, s2, s5
@@ -785,12 +769,12 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX8-NEXT:    s_lshl_b32 s3, s6, s8
-; GFX8-NEXT:    s_max_i32 s6, s5, s12
-; GFX8-NEXT:    s_sub_i32 s6, s6, s9
-; GFX8-NEXT:    s_min_i32 s5, s5, s12
+; GFX8-NEXT:    s_max_i32 s6, s5, s10
+; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fff
+; GFX8-NEXT:    s_min_i32 s5, s5, s10
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sub_i32 s5, s5, s10
+; GFX8-NEXT:    s_sub_i32 s5, s5, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s3, s6, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
@@ -798,35 +782,34 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s3, s4, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s3
-; GFX8-NEXT:    s_max_i32 s6, s5, s12
+; GFX8-NEXT:    s_max_i32 s6, s5, s10
 ; GFX8-NEXT:    s_lshl_b32 s4, s7, s8
-; GFX8-NEXT:    s_sub_i32 s6, s6, s9
-; GFX8-NEXT:    s_min_i32 s5, s5, s12
+; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fff
+; GFX8-NEXT:    s_min_i32 s5, s5, s10
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_sub_i32 s5, s5, s10
-; GFX8-NEXT:    s_max_i32 s4, s6, s4
-; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_min_i32 s4, s4, s5
+; GFX8-NEXT:    s_sub_i32 s5, s5, 0xffff8000
+; GFX8-NEXT:    s_max_i32 s4, s6, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s8
-; GFX8-NEXT:    s_sub_i32 s3, s3, s4
-; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_sext_i32_i16 s4, s4
+; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_and_b32 s1, s1, s4
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, s8
-; GFX8-NEXT:    s_and_b32 s0, s0, s4
+; GFX8-NEXT:    s_sub_i32 s3, s3, s4
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s2, s4
+; GFX8-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, s8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s3, s4
+; GFX8-NEXT:    s_and_b32 s1, s3, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -838,27 +821,26 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
-; GFX9-NEXT:    s_mov_b32 s4, 0x80008
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshl_b32 s3, s3, s4
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s8, s9
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 0x80008
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s8, s9
-; GFX9-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
-; GFX9-NEXT:    s_lshl_b32 s4, s6, s4
-; GFX9-NEXT:    s_lshl_b32 s6, s7, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
@@ -885,39 +867,37 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
-; GFX10-NEXT:    s_mov_b32 s3, 0x80008
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 0x80008
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s6, s7
-; GFX10-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX10-NEXT:    s_lshl_b32 s1, s1, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 0x80008
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
-; GFX10-NEXT:    s_lshl_b32 s4, s6, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
 ; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_sub_i16 v1, s2, s3 clamp
 ; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    s_movk_i32 s1, 0xff
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_b32_e32 v3, s1, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, 24
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v2
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -1255,19 +1235,17 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_brev_b32 s4, -2
-; GFX6-NEXT:    s_max_i32 s6, s0, -1
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_sub_i32 s6, s6, s4
-; GFX6-NEXT:    s_min_i32 s7, s0, -1
-; GFX6-NEXT:    s_sub_i32 s7, s7, s5
-; GFX6-NEXT:    s_max_i32 s2, s6, s2
-; GFX6-NEXT:    s_min_i32 s2, s2, s7
+; GFX6-NEXT:    s_max_i32 s4, s0, -1
+; GFX6-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s5, s0, -1
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x80000000
+; GFX6-NEXT:    s_max_i32 s2, s4, s2
+; GFX6-NEXT:    s_min_i32 s2, s2, s5
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX6-NEXT:    s_max_i32 s2, s1, -1
-; GFX6-NEXT:    s_sub_i32 s2, s2, s4
+; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s4, s1, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, s5
+; GFX6-NEXT:    s_sub_i32 s4, s4, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s2, s2, s3
 ; GFX6-NEXT:    s_min_i32 s2, s2, s4
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
@@ -1275,19 +1253,17 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ;
 ; GFX8-LABEL: s_ssubsat_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_brev_b32 s4, -2
-; GFX8-NEXT:    s_max_i32 s6, s0, -1
-; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    s_sub_i32 s6, s6, s4
-; GFX8-NEXT:    s_min_i32 s7, s0, -1
-; GFX8-NEXT:    s_sub_i32 s7, s7, s5
-; GFX8-NEXT:    s_max_i32 s2, s6, s2
-; GFX8-NEXT:    s_min_i32 s2, s2, s7
+; GFX8-NEXT:    s_max_i32 s4, s0, -1
+; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX8-NEXT:    s_min_i32 s5, s0, -1
+; GFX8-NEXT:    s_sub_i32 s5, s5, 0x80000000
+; GFX8-NEXT:    s_max_i32 s2, s4, s2
+; GFX8-NEXT:    s_min_i32 s2, s2, s5
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX8-NEXT:    s_max_i32 s2, s1, -1
-; GFX8-NEXT:    s_sub_i32 s2, s2, s4
+; GFX8-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s4, s1, -1
-; GFX8-NEXT:    s_sub_i32 s4, s4, s5
+; GFX8-NEXT:    s_sub_i32 s4, s4, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s2, s2, s3
 ; GFX8-NEXT:    s_min_i32 s2, s2, s4
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s2
@@ -1394,26 +1370,24 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_brev_b32 s6, -2
-; GFX6-NEXT:    s_max_i32 s8, s0, -1
-; GFX6-NEXT:    s_brev_b32 s7, 1
-; GFX6-NEXT:    s_sub_i32 s8, s8, s6
-; GFX6-NEXT:    s_min_i32 s9, s0, -1
-; GFX6-NEXT:    s_sub_i32 s9, s9, s7
-; GFX6-NEXT:    s_max_i32 s3, s8, s3
-; GFX6-NEXT:    s_min_i32 s3, s3, s9
+; GFX6-NEXT:    s_max_i32 s6, s0, -1
+; GFX6-NEXT:    s_sub_i32 s6, s6, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s7, s0, -1
+; GFX6-NEXT:    s_sub_i32 s7, s7, 0x80000000
+; GFX6-NEXT:    s_max_i32 s3, s6, s3
+; GFX6-NEXT:    s_min_i32 s3, s3, s7
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s3
 ; GFX6-NEXT:    s_max_i32 s3, s1, -1
-; GFX6-NEXT:    s_sub_i32 s3, s3, s6
-; GFX6-NEXT:    s_min_i32 s8, s1, -1
-; GFX6-NEXT:    s_sub_i32 s8, s8, s7
+; GFX6-NEXT:    s_sub_i32 s3, s3, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s6, s1, -1
+; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s3, s3, s4
-; GFX6-NEXT:    s_min_i32 s3, s3, s8
+; GFX6-NEXT:    s_min_i32 s3, s3, s6
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX6-NEXT:    s_max_i32 s3, s2, -1
-; GFX6-NEXT:    s_sub_i32 s3, s3, s6
+; GFX6-NEXT:    s_sub_i32 s3, s3, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s4, s2, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, s7
+; GFX6-NEXT:    s_sub_i32 s4, s4, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s3, s3, s5
 ; GFX6-NEXT:    s_min_i32 s3, s3, s4
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s3
@@ -1421,26 +1395,24 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ;
 ; GFX8-LABEL: s_ssubsat_v3i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_brev_b32 s6, -2
-; GFX8-NEXT:    s_max_i32 s8, s0, -1
-; GFX8-NEXT:    s_brev_b32 s7, 1
-; GFX8-NEXT:    s_sub_i32 s8, s8, s6
-; GFX8-NEXT:    s_min_i32 s9, s0, -1
-; GFX8-NEXT:    s_sub_i32 s9, s9, s7
-; GFX8-NEXT:    s_max_i32 s3, s8, s3
-; GFX8-NEXT:    s_min_i32 s3, s3, s9
+; GFX8-NEXT:    s_max_i32 s6, s0, -1
+; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fffffff
+; GFX8-NEXT:    s_min_i32 s7, s0, -1
+; GFX8-NEXT:    s_sub_i32 s7, s7, 0x80000000
+; GFX8-NEXT:    s_max_i32 s3, s6, s3
+; GFX8-NEXT:    s_min_i32 s3, s3, s7
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s3
 ; GFX8-NEXT:    s_max_i32 s3, s1, -1
-; GFX8-NEXT:    s_sub_i32 s3, s3, s6
-; GFX8-NEXT:    s_min_i32 s8, s1, -1
-; GFX8-NEXT:    s_sub_i32 s8, s8, s7
+; GFX8-NEXT:    s_sub_i32 s3, s3, 0x7fffffff
+; GFX8-NEXT:    s_min_i32 s6, s1, -1
+; GFX8-NEXT:    s_sub_i32 s6, s6, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s3, s3, s4
-; GFX8-NEXT:    s_min_i32 s3, s3, s8
+; GFX8-NEXT:    s_min_i32 s3, s3, s6
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX8-NEXT:    s_max_i32 s3, s2, -1
-; GFX8-NEXT:    s_sub_i32 s3, s3, s6
+; GFX8-NEXT:    s_sub_i32 s3, s3, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s4, s2, -1
-; GFX8-NEXT:    s_sub_i32 s4, s4, s7
+; GFX8-NEXT:    s_sub_i32 s4, s4, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s3, s3, s5
 ; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s3
@@ -1568,33 +1540,31 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_brev_b32 s8, -2
-; GFX6-NEXT:    s_max_i32 s10, s0, -1
-; GFX6-NEXT:    s_brev_b32 s9, 1
-; GFX6-NEXT:    s_sub_i32 s10, s10, s8
-; GFX6-NEXT:    s_min_i32 s11, s0, -1
-; GFX6-NEXT:    s_sub_i32 s11, s11, s9
-; GFX6-NEXT:    s_max_i32 s4, s10, s4
-; GFX6-NEXT:    s_min_i32 s4, s4, s11
+; GFX6-NEXT:    s_max_i32 s8, s0, -1
+; GFX6-NEXT:    s_sub_i32 s8, s8, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s9, s0, -1
+; GFX6-NEXT:    s_sub_i32 s9, s9, 0x80000000
+; GFX6-NEXT:    s_max_i32 s4, s8, s4
+; GFX6-NEXT:    s_min_i32 s4, s4, s9
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s4
 ; GFX6-NEXT:    s_max_i32 s4, s1, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, s8
-; GFX6-NEXT:    s_min_i32 s10, s1, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, s9
+; GFX6-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s8, s1, -1
+; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s4, s5
-; GFX6-NEXT:    s_min_i32 s4, s4, s10
+; GFX6-NEXT:    s_min_i32 s4, s4, s8
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s4
 ; GFX6-NEXT:    s_max_i32 s4, s2, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, s8
+; GFX6-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s5, s2, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, s9
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s4, s6
 ; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s4
 ; GFX6-NEXT:    s_max_i32 s4, s3, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, s8
+; GFX6-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s5, s3, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, s9
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s4, s7
 ; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
@@ -1602,33 +1572,31 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ;
 ; GFX8-LABEL: s_ssubsat_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_brev_b32 s8, -2
-; GFX8-NEXT:    s_max_i32 s10, s0, -1
-; GFX8-NEXT:    s_brev_b32 s9, 1
-; GFX8-NEXT:    s_sub_i32 s10, s10, s8
-; GFX8-NEXT:    s_min_i32 s11, s0, -1
-; GFX8-NEXT:    s_sub_i32 s11, s11, s9
-; GFX8-NEXT:    s_max_i32 s4, s10, s4
-; GFX8-NEXT:    s_min_i32 s4, s4, s11
+; GFX8-NEXT:    s_max_i32 s8, s0, -1
+; GFX8-NEXT:    s_sub_i32 s8, s8, 0x7fffffff
+; GFX8-NEXT:    s_min_i32 s9, s0, -1
+; GFX8-NEXT:    s_sub_i32 s9, s9, 0x80000000
+; GFX8-NEXT:    s_max_i32 s4, s8, s4
+; GFX8-NEXT:    s_min_i32 s4, s4, s9
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s4
 ; GFX8-NEXT:    s_max_i32 s4, s1, -1
-; GFX8-NEXT:    s_sub_i32 s4, s4, s8
-; GFX8-NEXT:    s_min_i32 s10, s1, -1
-; GFX8-NEXT:    s_sub_i32 s10, s10, s9
+; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX8-NEXT:    s_min_i32 s8, s1, -1
+; GFX8-NEXT:    s_sub_i32 s8, s8, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s4, s4, s5
-; GFX8-NEXT:    s_min_i32 s4, s4, s10
+; GFX8-NEXT:    s_min_i32 s4, s4, s8
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s4
 ; GFX8-NEXT:    s_max_i32 s4, s2, -1
-; GFX8-NEXT:    s_sub_i32 s4, s4, s8
+; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s5, s2, -1
-; GFX8-NEXT:    s_sub_i32 s5, s5, s9
+; GFX8-NEXT:    s_sub_i32 s5, s5, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s4
 ; GFX8-NEXT:    s_max_i32 s4, s3, -1
-; GFX8-NEXT:    s_sub_i32 s4, s4, s8
+; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s5, s3, -1
-; GFX8-NEXT:    s_sub_i32 s5, s5, s9
+; GFX8-NEXT:    s_sub_i32 s5, s5, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s4, s4, s7
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s4
@@ -1694,17 +1662,16 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-NEXT:    v_bfrev_b32_e32 v11, -2
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
-; GFX6-NEXT:    v_bfrev_b32_e32 v13, 1
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v11
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v13
+; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 0x80000000, v6
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v8
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v4
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v11
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 0x7fffffff, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v4
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v13
+; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 0x80000000, v6
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v5
@@ -1738,17 +1705,16 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-NEXT:    v_bfrev_b32_e32 v11, -2
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, -1, v3
-; GFX8-NEXT:    v_bfrev_b32_e32 v13, 1
 ; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v11
 ; GFX8-NEXT:    v_min_i32_e32 v6, -1, v3
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v6, v13
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 0x80000000, v6
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v8
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, -1, v4
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v11
+; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, 0x7fffffff, v5
 ; GFX8-NEXT:    v_min_i32_e32 v6, -1, v4
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v6, v13
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 0x80000000, v6
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v9
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v5
@@ -1781,40 +1747,38 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_v5i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_brev_b32 s10, -2
-; GFX6-NEXT:    s_max_i32 s12, s0, -1
-; GFX6-NEXT:    s_brev_b32 s11, 1
-; GFX6-NEXT:    s_sub_i32 s12, s12, s10
-; GFX6-NEXT:    s_min_i32 s13, s0, -1
-; GFX6-NEXT:    s_sub_i32 s13, s13, s11
-; GFX6-NEXT:    s_max_i32 s5, s12, s5
-; GFX6-NEXT:    s_min_i32 s5, s5, s13
+; GFX6-NEXT:    s_max_i32 s10, s0, -1
+; GFX6-NEXT:    s_sub_i32 s10, s10, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s11, s0, -1
+; GFX6-NEXT:    s_sub_i32 s11, s11, 0x80000000
+; GFX6-NEXT:    s_max_i32 s5, s10, s5
+; GFX6-NEXT:    s_min_i32 s5, s5, s11
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s5
 ; GFX6-NEXT:    s_max_i32 s5, s1, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, s10
-; GFX6-NEXT:    s_min_i32 s12, s1, -1
-; GFX6-NEXT:    s_sub_i32 s12, s12, s11
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s10, s1, -1
+; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s5, s5, s6
-; GFX6-NEXT:    s_min_i32 s5, s5, s12
+; GFX6-NEXT:    s_min_i32 s5, s5, s10
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s5
 ; GFX6-NEXT:    s_max_i32 s5, s2, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, s10
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s6, s2, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, s11
+; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s5, s5, s7
 ; GFX6-NEXT:    s_min_i32 s5, s5, s6
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s5
 ; GFX6-NEXT:    s_max_i32 s5, s3, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, s10
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s6, s3, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, s11
+; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s5, s5, s8
 ; GFX6-NEXT:    s_min_i32 s5, s5, s6
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s5
 ; GFX6-NEXT:    s_max_i32 s5, s4, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, s10
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s6, s4, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, s11
+; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s5, s5, s9
 ; GFX6-NEXT:    s_min_i32 s5, s5, s6
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s5
@@ -1822,40 +1786,38 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ;
 ; GFX8-LABEL: s_ssubsat_v5i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_brev_b32 s10, -2
-; GFX8-NEXT:    s_max_i32 s12, s0, -1
-; GFX8-NEXT:    s_brev_b32 s11, 1
-; GFX8-NEXT:    s_sub_i32 s12, s12, s10
-; GFX8-NEXT:    s_min_i32 s13, s0, -1
-; GFX8-NEXT:    s_sub_i32 s13, s13, s11
-; GFX8-NEXT:    s_max_i32 s5, s12, s5
-; GFX8-NEXT:    s_min_i32 s5, s5, s13
+; GFX8-NEXT:    s_max_i32 s10, s0, -1
+; GFX8-NEXT:    s_sub_i32 s10, s10, 0x7fffffff
+; GFX8-NEXT:    s_min_i32 s11, s0, -1
+; GFX8-NEXT:    s_sub_i32 s11, s11, 0x80000000
+; GFX8-NEXT:    s_max_i32 s5, s10, s5
+; GFX8-NEXT:    s_min_i32 s5, s5, s11
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s5
 ; GFX8-NEXT:    s_max_i32 s5, s1, -1
-; GFX8-NEXT:    s_sub_i32 s5, s5, s10
-; GFX8-NEXT:    s_min_i32 s12, s1, -1
-; GFX8-NEXT:    s_sub_i32 s12, s12, s11
+; GFX8-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX8-NEXT:    s_min_i32 s10, s1, -1
+; GFX8-NEXT:    s_sub_i32 s10, s10, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s5, s5, s6
-; GFX8-NEXT:    s_min_i32 s5, s5, s12
+; GFX8-NEXT:    s_min_i32 s5, s5, s10
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s5
 ; GFX8-NEXT:    s_max_i32 s5, s2, -1
-; GFX8-NEXT:    s_sub_i32 s5, s5, s10
+; GFX8-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s6, s2, -1
-; GFX8-NEXT:    s_sub_i32 s6, s6, s11
+; GFX8-NEXT:    s_sub_i32 s6, s6, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s5, s5, s7
 ; GFX8-NEXT:    s_min_i32 s5, s5, s6
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s5
 ; GFX8-NEXT:    s_max_i32 s5, s3, -1
-; GFX8-NEXT:    s_sub_i32 s5, s5, s10
+; GFX8-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s6, s3, -1
-; GFX8-NEXT:    s_sub_i32 s6, s6, s11
+; GFX8-NEXT:    s_sub_i32 s6, s6, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s5, s5, s8
 ; GFX8-NEXT:    s_min_i32 s5, s5, s6
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s5
 ; GFX8-NEXT:    s_max_i32 s5, s4, -1
-; GFX8-NEXT:    s_sub_i32 s5, s5, s10
+; GFX8-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s6, s4, -1
-; GFX8-NEXT:    s_sub_i32 s6, s6, s11
+; GFX8-NEXT:    s_sub_i32 s6, s6, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s5, s5, s9
 ; GFX8-NEXT:    s_min_i32 s5, s5, s6
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s5
@@ -2197,117 +2159,115 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_v16i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_brev_b32 s32, -2
-; GFX6-NEXT:    s_max_i32 s34, s0, -1
-; GFX6-NEXT:    s_brev_b32 s33, 1
-; GFX6-NEXT:    s_sub_i32 s34, s34, s32
-; GFX6-NEXT:    s_min_i32 s35, s0, -1
-; GFX6-NEXT:    s_sub_i32 s35, s35, s33
-; GFX6-NEXT:    s_max_i32 s16, s34, s16
-; GFX6-NEXT:    s_min_i32 s16, s16, s35
+; GFX6-NEXT:    s_max_i32 s32, s0, -1
+; GFX6-NEXT:    s_sub_i32 s32, s32, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s33, s0, -1
+; GFX6-NEXT:    s_sub_i32 s33, s33, 0x80000000
+; GFX6-NEXT:    s_max_i32 s16, s32, s16
+; GFX6-NEXT:    s_min_i32 s16, s16, s33
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s16
 ; GFX6-NEXT:    s_max_i32 s16, s1, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_min_i32 s34, s1, -1
-; GFX6-NEXT:    s_sub_i32 s34, s34, s33
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s32, s1, -1
+; GFX6-NEXT:    s_sub_i32 s32, s32, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s17
-; GFX6-NEXT:    s_min_i32 s16, s16, s34
+; GFX6-NEXT:    s_min_i32 s16, s16, s32
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s16
 ; GFX6-NEXT:    s_max_i32 s16, s2, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s17, s2, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, s33
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s18
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s16
 ; GFX6-NEXT:    s_max_i32 s16, s3, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s17, s3, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, s33
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s19
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s16
 ; GFX6-NEXT:    s_max_i32 s16, s4, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s17, s4, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, s33
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s20
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s16
 ; GFX6-NEXT:    s_max_i32 s16, s5, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s17, s5, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, s33
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s21
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s16
 ; GFX6-NEXT:    s_max_i32 s16, s6, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s17, s6, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, s33
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s22
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s16
 ; GFX6-NEXT:    s_max_i32 s16, s7, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s17, s7, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, s33
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s23
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s16
 ; GFX6-NEXT:    s_max_i32 s16, s8, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s17, s8, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, s33
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s24
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s16
 ; GFX6-NEXT:    s_max_i32 s16, s9, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s17, s9, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, s33
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s25
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
 ; GFX6-NEXT:    s_max_i32 s16, s10, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s17, s10, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, s33
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s26
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s16
 ; GFX6-NEXT:    s_max_i32 s16, s11, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s17, s11, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, s33
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s27
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s11, s11, s16
 ; GFX6-NEXT:    s_max_i32 s16, s12, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s17, s12, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, s33
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s28
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s12, s12, s16
 ; GFX6-NEXT:    s_max_i32 s16, s13, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s17, s13, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, s33
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s29
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s13, s13, s16
 ; GFX6-NEXT:    s_max_i32 s16, s14, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s17, s14, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, s33
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s30
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s14, s14, s16
 ; GFX6-NEXT:    s_max_i32 s16, s15, -1
-; GFX6-NEXT:    s_sub_i32 s16, s16, s32
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s17, s15, -1
-; GFX6-NEXT:    s_sub_i32 s17, s17, s33
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s16, s16, s31
 ; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s15, s15, s16
@@ -2315,117 +2275,115 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 ;
 ; GFX8-LABEL: s_ssubsat_v16i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_brev_b32 s32, -2
-; GFX8-NEXT:    s_max_i32 s34, s0, -1
-; GFX8-NEXT:    s_brev_b32 s33, 1
-; GFX8-NEXT:    s_sub_i32 s34, s34, s32
-; GFX8-NEXT:    s_min_i32 s35, s0, -1
-; GFX8-NEXT:    s_sub_i32 s35, s35, s33
-; GFX8-NEXT:    s_max_i32 s16, s34, s16
-; GFX8-NEXT:    s_min_i32 s16, s16, s35
+; GFX8-NEXT:    s_max_i32 s32, s0, -1
+; GFX8-NEXT:    s_sub_i32 s32, s32, 0x7fffffff
+; GFX8-NEXT:    s_min_i32 s33, s0, -1
+; GFX8-NEXT:    s_sub_i32 s33, s33, 0x80000000
+; GFX8-NEXT:    s_max_i32 s16, s32, s16
+; GFX8-NEXT:    s_min_i32 s16, s16, s33
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s16
 ; GFX8-NEXT:    s_max_i32 s16, s1, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_min_i32 s34, s1, -1
-; GFX8-NEXT:    s_sub_i32 s34, s34, s33
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX8-NEXT:    s_min_i32 s32, s1, -1
+; GFX8-NEXT:    s_sub_i32 s32, s32, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s17
-; GFX8-NEXT:    s_min_i32 s16, s16, s34
+; GFX8-NEXT:    s_min_i32 s16, s16, s32
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s16
 ; GFX8-NEXT:    s_max_i32 s16, s2, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s17, s2, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, s33
+; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s18
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s16
 ; GFX8-NEXT:    s_max_i32 s16, s3, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s17, s3, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, s33
+; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s19
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s16
 ; GFX8-NEXT:    s_max_i32 s16, s4, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s17, s4, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, s33
+; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s20
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s16
 ; GFX8-NEXT:    s_max_i32 s16, s5, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s17, s5, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, s33
+; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s21
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s5, s5, s16
 ; GFX8-NEXT:    s_max_i32 s16, s6, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s17, s6, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, s33
+; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s22
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s16
 ; GFX8-NEXT:    s_max_i32 s16, s7, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s17, s7, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, s33
+; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s23
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s7, s7, s16
 ; GFX8-NEXT:    s_max_i32 s16, s8, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s17, s8, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, s33
+; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s24
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s8, s8, s16
 ; GFX8-NEXT:    s_max_i32 s16, s9, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s17, s9, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, s33
+; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s25
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s9, s9, s16
 ; GFX8-NEXT:    s_max_i32 s16, s10, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s17, s10, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, s33
+; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s26
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s10, s10, s16
 ; GFX8-NEXT:    s_max_i32 s16, s11, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s17, s11, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, s33
+; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s27
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s11, s11, s16
 ; GFX8-NEXT:    s_max_i32 s16, s12, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s17, s12, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, s33
+; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s28
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s12, s12, s16
 ; GFX8-NEXT:    s_max_i32 s16, s13, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s17, s13, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, s33
+; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s29
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s13, s13, s16
 ; GFX8-NEXT:    s_max_i32 s16, s14, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s17, s14, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, s33
+; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s30
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s14, s14, s16
 ; GFX8-NEXT:    s_max_i32 s16, s15, -1
-; GFX8-NEXT:    s_sub_i32 s16, s16, s32
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
 ; GFX8-NEXT:    s_min_i32 s17, s15, -1
-; GFX8-NEXT:    s_sub_i32 s17, s17, s33
+; GFX8-NEXT:    s_sub_i32 s17, s17, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s16, s16, s31
 ; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s15, s15, s16
@@ -2767,60 +2725,55 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX6-LABEL: s_ssubsat_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_brev_b32 s4, -2
-; GFX6-NEXT:    s_max_i32 s6, s0, -1
+; GFX6-NEXT:    s_max_i32 s4, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_sub_i32 s6, s6, s4
-; GFX6-NEXT:    s_min_i32 s7, s0, -1
-; GFX6-NEXT:    s_sub_i32 s7, s7, s5
-; GFX6-NEXT:    s_max_i32 s2, s6, s2
-; GFX6-NEXT:    s_min_i32 s2, s2, s7
+; GFX6-NEXT:    s_sub_i32 s4, s4, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s5, s0, -1
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x80000000
+; GFX6-NEXT:    s_max_i32 s2, s4, s2
+; GFX6-NEXT:    s_min_i32 s2, s2, s5
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_max_i32 s3, s1, -1
-; GFX6-NEXT:    s_sub_i32 s3, s3, s4
+; GFX6-NEXT:    s_sub_i32 s3, s3, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s4, s1, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, s5
+; GFX6-NEXT:    s_sub_i32 s4, s4, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s2, s3, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s4
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX6-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_ssubsat_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s6, s0
-; GFX8-NEXT:    s_sext_i32_i16 s7, -1
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX8-NEXT:    s_max_i32 s8, s6, s7
-; GFX8-NEXT:    s_sub_i32 s8, s8, s4
+; GFX8-NEXT:    s_sext_i32_i16 s4, s0
+; GFX8-NEXT:    s_sext_i32_i16 s5, -1
+; GFX8-NEXT:    s_max_i32 s6, s4, s5
+; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_movk_i32 s5, 0x8000
-; GFX8-NEXT:    s_min_i32 s6, s6, s7
-; GFX8-NEXT:    s_sext_i32_i16 s8, s8
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
+; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sub_i32 s6, s6, s5
-; GFX8-NEXT:    s_max_i32 s1, s8, s1
+; GFX8-NEXT:    s_sub_i32 s4, s4, 0xffff8000
+; GFX8-NEXT:    s_max_i32 s1, s6, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sext_i32_i16 s6, s6
+; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_min_i32 s1, s1, s6
+; GFX8-NEXT:    s_min_i32 s1, s1, s4
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s2
-; GFX8-NEXT:    s_max_i32 s6, s1, s7
-; GFX8-NEXT:    s_sub_i32 s4, s6, s4
-; GFX8-NEXT:    s_min_i32 s1, s1, s7
+; GFX8-NEXT:    s_max_i32 s4, s1, s5
+; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fff
+; GFX8-NEXT:    s_min_i32 s1, s1, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sub_i32 s1, s1, s5
+; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s3, s4, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
@@ -2853,22 +2806,20 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: ssubsat_v2i16_sv:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_brev_b32 s2, -2
-; GFX6-NEXT:    s_max_i32 s4, s0, -1
+; GFX6-NEXT:    s_max_i32 s2, s0, -1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_brev_b32 s3, 1
-; GFX6-NEXT:    s_sub_i32 s4, s4, s2
-; GFX6-NEXT:    s_min_i32 s5, s0, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, s3
-; GFX6-NEXT:    v_max_i32_e32 v0, s4, v0
-; GFX6-NEXT:    v_min_i32_e32 v0, s5, v0
+; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s3, s0, -1
+; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
+; GFX6-NEXT:    v_max_i32_e32 v0, s2, v0
+; GFX6-NEXT:    v_min_i32_e32 v0, s3, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
 ; GFX6-NEXT:    s_max_i32 s1, s0, -1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_sub_i32 s1, s1, s2
+; GFX6-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s2, s0, -1
-; GFX6-NEXT:    s_sub_i32 s2, s2, s3
+; GFX6-NEXT:    s_sub_i32 s2, s2, 0x80000000
 ; GFX6-NEXT:    v_max_i32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
@@ -2883,25 +2834,23 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ;
 ; GFX8-LABEL: ssubsat_v2i16_sv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s4, s0
-; GFX8-NEXT:    s_sext_i32_i16 s5, -1
-; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX8-NEXT:    s_max_i32 s6, s4, s5
-; GFX8-NEXT:    s_movk_i32 s3, 0x8000
-; GFX8-NEXT:    s_sub_i32 s6, s6, s2
-; GFX8-NEXT:    s_min_i32 s4, s4, s5
+; GFX8-NEXT:    s_sext_i32_i16 s2, s0
+; GFX8-NEXT:    s_sext_i32_i16 s3, -1
+; GFX8-NEXT:    s_max_i32 s4, s2, s3
+; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fff
+; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_sub_i32 s4, s4, s3
-; GFX8-NEXT:    v_max_i16_e32 v1, s6, v0
-; GFX8-NEXT:    v_min_i16_e32 v1, s4, v1
-; GFX8-NEXT:    s_sext_i32_i16 s4, s1
-; GFX8-NEXT:    s_max_i32 s6, s4, s5
-; GFX8-NEXT:    s_sub_i32 s2, s6, s2
-; GFX8-NEXT:    s_min_i32 s4, s4, s5
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    s_sub_i32 s3, s4, s3
+; GFX8-NEXT:    s_sub_i32 s2, s2, 0xffff8000
+; GFX8-NEXT:    v_max_i16_e32 v1, s4, v0
+; GFX8-NEXT:    v_min_i16_e32 v1, s2, v1
+; GFX8-NEXT:    s_sext_i32_i16 s2, s1
+; GFX8-NEXT:    s_max_i32 s4, s2, s3
+; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fff
+; GFX8-NEXT:    s_min_i32 s2, s2, s3
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    s_sub_i32 s2, s2, 0xffff8000
 ; GFX8-NEXT:    v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v0, s3, v0
+; GFX8-NEXT:    v_min_i16_e32 v0, s2, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_sub_u16_e32 v1, s0, v1
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3123,97 +3072,92 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-LABEL: s_ssubsat_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_brev_b32 s8, -2
-; GFX6-NEXT:    s_max_i32 s10, s0, -1
+; GFX6-NEXT:    s_max_i32 s8, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX6-NEXT:    s_brev_b32 s9, 1
-; GFX6-NEXT:    s_sub_i32 s10, s10, s8
-; GFX6-NEXT:    s_min_i32 s11, s0, -1
-; GFX6-NEXT:    s_sub_i32 s11, s11, s9
-; GFX6-NEXT:    s_max_i32 s4, s10, s4
-; GFX6-NEXT:    s_min_i32 s4, s4, s11
+; GFX6-NEXT:    s_sub_i32 s8, s8, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s9, s0, -1
+; GFX6-NEXT:    s_sub_i32 s9, s9, 0x80000000
+; GFX6-NEXT:    s_max_i32 s4, s8, s4
+; GFX6-NEXT:    s_min_i32 s4, s4, s9
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
 ; GFX6-NEXT:    s_max_i32 s5, s1, -1
-; GFX6-NEXT:    s_sub_i32 s5, s5, s8
-; GFX6-NEXT:    s_min_i32 s10, s1, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, s9
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s8, s1, -1
+; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s5, s4
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_min_i32 s4, s4, s10
+; GFX6-NEXT:    s_min_i32 s4, s4, s8
 ; GFX6-NEXT:    s_max_i32 s5, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s6, 16
-; GFX6-NEXT:    s_sub_i32 s5, s5, s8
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s6, s2, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, s9
+; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s5, s4
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s4, s4, s6
 ; GFX6-NEXT:    s_max_i32 s5, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
-; GFX6-NEXT:    s_sub_i32 s5, s5, s8
+; GFX6-NEXT:    s_sub_i32 s5, s5, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s6, s3, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, s9
+; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s5, s4
-; GFX6-NEXT:    s_min_i32 s4, s4, s6
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX6-NEXT:    s_sub_i32 s3, s3, s4
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
+; GFX6-NEXT:    s_min_i32 s4, s4, s6
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, s4
+; GFX6-NEXT:    s_sub_i32 s3, s3, s4
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s4
-; GFX6-NEXT:    s_and_b32 s2, s3, s4
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_ssubsat_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s10, s0
-; GFX8-NEXT:    s_sext_i32_i16 s11, -1
-; GFX8-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX8-NEXT:    s_max_i32 s12, s10, s11
-; GFX8-NEXT:    s_sub_i32 s12, s12, s8
+; GFX8-NEXT:    s_sext_i32_i16 s8, s0
+; GFX8-NEXT:    s_sext_i32_i16 s9, -1
+; GFX8-NEXT:    s_max_i32 s10, s8, s9
+; GFX8-NEXT:    s_sub_i32 s10, s10, 0x7fff
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
-; GFX8-NEXT:    s_movk_i32 s9, 0x8000
-; GFX8-NEXT:    s_min_i32 s10, s10, s11
-; GFX8-NEXT:    s_sext_i32_i16 s12, s12
+; GFX8-NEXT:    s_min_i32 s8, s8, s9
+; GFX8-NEXT:    s_sext_i32_i16 s10, s10
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sub_i32 s10, s10, s9
-; GFX8-NEXT:    s_max_i32 s2, s12, s2
+; GFX8-NEXT:    s_sub_i32 s8, s8, 0xffff8000
+; GFX8-NEXT:    s_max_i32 s2, s10, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sext_i32_i16 s10, s10
+; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX8-NEXT:    s_min_i32 s2, s2, s10
+; GFX8-NEXT:    s_min_i32 s2, s2, s8
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s4
-; GFX8-NEXT:    s_max_i32 s10, s2, s11
-; GFX8-NEXT:    s_sub_i32 s10, s10, s8
-; GFX8-NEXT:    s_min_i32 s2, s2, s11
-; GFX8-NEXT:    s_sext_i32_i16 s10, s10
+; GFX8-NEXT:    s_max_i32 s8, s2, s9
+; GFX8-NEXT:    s_sub_i32 s8, s8, 0x7fff
+; GFX8-NEXT:    s_min_i32 s2, s2, s9
+; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_sub_i32 s2, s2, s9
-; GFX8-NEXT:    s_max_i32 s6, s10, s6
+; GFX8-NEXT:    s_sub_i32 s2, s2, 0xffff8000
+; GFX8-NEXT:    s_max_i32 s6, s8, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_min_i32 s2, s6, s2
 ; GFX8-NEXT:    s_sub_i32 s2, s4, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s1
-; GFX8-NEXT:    s_max_i32 s6, s4, s11
-; GFX8-NEXT:    s_sub_i32 s6, s6, s8
+; GFX8-NEXT:    s_max_i32 s6, s4, s9
+; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fff
 ; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
-; GFX8-NEXT:    s_min_i32 s4, s4, s11
+; GFX8-NEXT:    s_min_i32 s4, s4, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sub_i32 s4, s4, s9
+; GFX8-NEXT:    s_sub_i32 s4, s4, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s3, s6, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
@@ -3221,12 +3165,12 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s5
-; GFX8-NEXT:    s_max_i32 s4, s3, s11
-; GFX8-NEXT:    s_sub_i32 s4, s4, s8
-; GFX8-NEXT:    s_min_i32 s3, s3, s11
+; GFX8-NEXT:    s_max_i32 s4, s3, s9
+; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fff
+; GFX8-NEXT:    s_min_i32 s3, s3, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s7
-; GFX8-NEXT:    s_sub_i32 s3, s3, s9
+; GFX8-NEXT:    s_sub_i32 s3, s3, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
@@ -3440,121 +3384,116 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-LABEL: s_ssubsat_v6i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_brev_b32 s12, -2
-; GFX6-NEXT:    s_max_i32 s14, s0, -1
+; GFX6-NEXT:    s_max_i32 s12, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_brev_b32 s13, 1
-; GFX6-NEXT:    s_sub_i32 s14, s14, s12
-; GFX6-NEXT:    s_min_i32 s15, s0, -1
-; GFX6-NEXT:    s_sub_i32 s15, s15, s13
-; GFX6-NEXT:    s_max_i32 s6, s14, s6
-; GFX6-NEXT:    s_min_i32 s6, s6, s15
+; GFX6-NEXT:    s_sub_i32 s12, s12, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s13, s0, -1
+; GFX6-NEXT:    s_sub_i32 s13, s13, 0x80000000
+; GFX6-NEXT:    s_max_i32 s6, s12, s6
+; GFX6-NEXT:    s_min_i32 s6, s6, s13
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s7, 16
 ; GFX6-NEXT:    s_max_i32 s7, s1, -1
-; GFX6-NEXT:    s_sub_i32 s7, s7, s12
-; GFX6-NEXT:    s_min_i32 s14, s1, -1
-; GFX6-NEXT:    s_sub_i32 s14, s14, s13
+; GFX6-NEXT:    s_sub_i32 s7, s7, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s12, s1, -1
+; GFX6-NEXT:    s_sub_i32 s12, s12, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_min_i32 s6, s6, s14
+; GFX6-NEXT:    s_min_i32 s6, s6, s12
 ; GFX6-NEXT:    s_max_i32 s7, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s8, 16
-; GFX6-NEXT:    s_sub_i32 s7, s7, s12
+; GFX6-NEXT:    s_sub_i32 s7, s7, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s8, s2, -1
-; GFX6-NEXT:    s_sub_i32 s8, s8, s13
+; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s8
 ; GFX6-NEXT:    s_max_i32 s7, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
-; GFX6-NEXT:    s_sub_i32 s7, s7, s12
+; GFX6-NEXT:    s_sub_i32 s7, s7, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s8, s3, -1
-; GFX6-NEXT:    s_sub_i32 s8, s8, s13
+; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s8
 ; GFX6-NEXT:    s_max_i32 s7, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
-; GFX6-NEXT:    s_sub_i32 s7, s7, s12
+; GFX6-NEXT:    s_sub_i32 s7, s7, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s8, s4, -1
-; GFX6-NEXT:    s_sub_i32 s8, s8, s13
+; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s8
 ; GFX6-NEXT:    s_max_i32 s7, s5, -1
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
-; GFX6-NEXT:    s_sub_i32 s7, s7, s12
+; GFX6-NEXT:    s_sub_i32 s7, s7, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s8, s5, -1
-; GFX6-NEXT:    s_sub_i32 s8, s8, s13
-; GFX6-NEXT:    s_max_i32 s6, s7, s6
-; GFX6-NEXT:    s_min_i32 s6, s6, s8
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX6-NEXT:    s_sub_i32 s5, s5, s6
-; GFX6-NEXT:    s_mov_b32 s6, 0xffff
+; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
+; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, s6
+; GFX6-NEXT:    s_min_i32 s6, s6, s8
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s0, s0, s6
+; GFX6-NEXT:    s_sub_i32 s5, s5, s6
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s6
-; GFX6-NEXT:    s_and_b32 s2, s3, s6
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, s6
+; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s2, s4, s6
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_ssubsat_v6i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s14, s0
-; GFX8-NEXT:    s_sext_i32_i16 s15, -1
-; GFX8-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX8-NEXT:    s_max_i32 s16, s14, s15
-; GFX8-NEXT:    s_sub_i32 s16, s16, s12
+; GFX8-NEXT:    s_sext_i32_i16 s12, s0
+; GFX8-NEXT:    s_sext_i32_i16 s13, -1
+; GFX8-NEXT:    s_max_i32 s14, s12, s13
+; GFX8-NEXT:    s_sub_i32 s14, s14, 0x7fff
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
-; GFX8-NEXT:    s_movk_i32 s13, 0x8000
-; GFX8-NEXT:    s_min_i32 s14, s14, s15
-; GFX8-NEXT:    s_sext_i32_i16 s16, s16
+; GFX8-NEXT:    s_min_i32 s12, s12, s13
+; GFX8-NEXT:    s_sext_i32_i16 s14, s14
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sub_i32 s14, s14, s13
-; GFX8-NEXT:    s_max_i32 s3, s16, s3
+; GFX8-NEXT:    s_sub_i32 s12, s12, 0xffff8000
+; GFX8-NEXT:    s_max_i32 s3, s14, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sext_i32_i16 s14, s14
+; GFX8-NEXT:    s_sext_i32_i16 s12, s12
 ; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX8-NEXT:    s_min_i32 s3, s3, s14
+; GFX8-NEXT:    s_min_i32 s3, s3, s12
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s6
-; GFX8-NEXT:    s_max_i32 s14, s3, s15
-; GFX8-NEXT:    s_sub_i32 s14, s14, s12
-; GFX8-NEXT:    s_min_i32 s3, s3, s15
-; GFX8-NEXT:    s_sext_i32_i16 s14, s14
+; GFX8-NEXT:    s_max_i32 s12, s3, s13
+; GFX8-NEXT:    s_sub_i32 s12, s12, 0x7fff
+; GFX8-NEXT:    s_min_i32 s3, s3, s13
+; GFX8-NEXT:    s_sext_i32_i16 s12, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
-; GFX8-NEXT:    s_sub_i32 s3, s3, s13
-; GFX8-NEXT:    s_max_i32 s9, s14, s9
+; GFX8-NEXT:    s_sub_i32 s3, s3, 0xffff8000
+; GFX8-NEXT:    s_max_i32 s9, s12, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_min_i32 s3, s9, s3
 ; GFX8-NEXT:    s_sub_i32 s3, s6, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s1
-; GFX8-NEXT:    s_max_i32 s9, s6, s15
-; GFX8-NEXT:    s_sub_i32 s9, s9, s12
+; GFX8-NEXT:    s_max_i32 s9, s6, s13
+; GFX8-NEXT:    s_sub_i32 s9, s9, 0x7fff
 ; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
-; GFX8-NEXT:    s_min_i32 s6, s6, s15
+; GFX8-NEXT:    s_min_i32 s6, s6, s13
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_sub_i32 s6, s6, s13
+; GFX8-NEXT:    s_sub_i32 s6, s6, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s4, s9, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
@@ -3562,25 +3501,25 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_min_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s7
-; GFX8-NEXT:    s_max_i32 s6, s4, s15
-; GFX8-NEXT:    s_sub_i32 s6, s6, s12
-; GFX8-NEXT:    s_min_i32 s4, s4, s15
+; GFX8-NEXT:    s_max_i32 s6, s4, s13
+; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fff
+; GFX8-NEXT:    s_min_i32 s4, s4, s13
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s10
-; GFX8-NEXT:    s_sub_i32 s4, s4, s13
+; GFX8-NEXT:    s_sub_i32 s4, s4, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s6, s6, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_min_i32 s4, s6, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s2
 ; GFX8-NEXT:    s_sub_i32 s4, s7, s4
-; GFX8-NEXT:    s_max_i32 s7, s6, s15
-; GFX8-NEXT:    s_sub_i32 s7, s7, s12
+; GFX8-NEXT:    s_max_i32 s7, s6, s13
+; GFX8-NEXT:    s_sub_i32 s7, s7, 0x7fff
 ; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
-; GFX8-NEXT:    s_min_i32 s6, s6, s15
+; GFX8-NEXT:    s_min_i32 s6, s6, s13
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_sub_i32 s6, s6, s13
+; GFX8-NEXT:    s_sub_i32 s6, s6, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s5, s7, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
@@ -3588,12 +3527,12 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_min_i32 s5, s5, s6
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s8
-; GFX8-NEXT:    s_max_i32 s6, s5, s15
-; GFX8-NEXT:    s_sub_i32 s6, s6, s12
-; GFX8-NEXT:    s_min_i32 s5, s5, s15
+; GFX8-NEXT:    s_max_i32 s6, s5, s13
+; GFX8-NEXT:    s_sub_i32 s6, s6, 0x7fff
+; GFX8-NEXT:    s_min_i32 s5, s5, s13
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s11
-; GFX8-NEXT:    s_sub_i32 s5, s5, s13
+; GFX8-NEXT:    s_sub_i32 s5, s5, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s6, s6, s7
 ; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
@@ -3847,145 +3786,140 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-LABEL: s_ssubsat_v8i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_brev_b32 s16, -2
-; GFX6-NEXT:    s_max_i32 s18, s0, -1
+; GFX6-NEXT:    s_max_i32 s16, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX6-NEXT:    s_brev_b32 s17, 1
-; GFX6-NEXT:    s_sub_i32 s18, s18, s16
-; GFX6-NEXT:    s_min_i32 s19, s0, -1
-; GFX6-NEXT:    s_sub_i32 s19, s19, s17
-; GFX6-NEXT:    s_max_i32 s8, s18, s8
-; GFX6-NEXT:    s_min_i32 s8, s8, s19
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s17, s0, -1
+; GFX6-NEXT:    s_sub_i32 s17, s17, 0x80000000
+; GFX6-NEXT:    s_max_i32 s8, s16, s8
+; GFX6-NEXT:    s_min_i32 s8, s8, s17
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s9, 16
 ; GFX6-NEXT:    s_max_i32 s9, s1, -1
-; GFX6-NEXT:    s_sub_i32 s9, s9, s16
-; GFX6-NEXT:    s_min_i32 s18, s1, -1
-; GFX6-NEXT:    s_sub_i32 s18, s18, s17
+; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s16, s1, -1
+; GFX6-NEXT:    s_sub_i32 s16, s16, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_min_i32 s8, s8, s18
+; GFX6-NEXT:    s_min_i32 s8, s8, s16
 ; GFX6-NEXT:    s_max_i32 s9, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s10, 16
-; GFX6-NEXT:    s_sub_i32 s9, s9, s16
+; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s10, s2, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, s17
+; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
-; GFX6-NEXT:    s_sub_i32 s9, s9, s16
+; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s10, s3, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, s17
+; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
-; GFX6-NEXT:    s_sub_i32 s9, s9, s16
+; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s10, s4, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, s17
+; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s5, -1
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
-; GFX6-NEXT:    s_sub_i32 s9, s9, s16
+; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s10, s5, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, s17
+; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s6, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
-; GFX6-NEXT:    s_sub_i32 s9, s9, s16
+; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s10, s6, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, s17
+; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s7, -1
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
-; GFX6-NEXT:    s_sub_i32 s9, s9, s16
+; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s10, s7, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, s17
-; GFX6-NEXT:    s_max_i32 s8, s9, s8
-; GFX6-NEXT:    s_min_i32 s8, s8, s10
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX6-NEXT:    s_sub_i32 s7, s7, s8
-; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, s8
+; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
+; GFX6-NEXT:    s_max_i32 s8, s9, s8
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s0, s0, s8
+; GFX6-NEXT:    s_min_i32 s8, s8, s10
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
+; GFX6-NEXT:    s_sub_i32 s7, s7, s8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s8
-; GFX6-NEXT:    s_and_b32 s2, s3, s8
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, s8
+; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s2, s4, s8
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s4, s7, s8
+; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_and_b32 s3, s6, s8
+; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_ssubsat_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s18, s0
-; GFX8-NEXT:    s_sext_i32_i16 s19, -1
-; GFX8-NEXT:    s_movk_i32 s16, 0x7fff
-; GFX8-NEXT:    s_max_i32 s20, s18, s19
-; GFX8-NEXT:    s_sub_i32 s20, s20, s16
+; GFX8-NEXT:    s_sext_i32_i16 s16, s0
+; GFX8-NEXT:    s_sext_i32_i16 s17, -1
+; GFX8-NEXT:    s_max_i32 s18, s16, s17
+; GFX8-NEXT:    s_sub_i32 s18, s18, 0x7fff
 ; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX8-NEXT:    s_movk_i32 s17, 0x8000
-; GFX8-NEXT:    s_min_i32 s18, s18, s19
-; GFX8-NEXT:    s_sext_i32_i16 s20, s20
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
+; GFX8-NEXT:    s_sext_i32_i16 s18, s18
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_sub_i32 s18, s18, s17
-; GFX8-NEXT:    s_max_i32 s4, s20, s4
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0xffff8000
+; GFX8-NEXT:    s_max_i32 s4, s18, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_sext_i32_i16 s18, s18
+; GFX8-NEXT:    s_sext_i32_i16 s16, s16
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX8-NEXT:    s_min_i32 s4, s4, s18
+; GFX8-NEXT:    s_min_i32 s4, s4, s16
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s8
-; GFX8-NEXT:    s_max_i32 s18, s4, s19
-; GFX8-NEXT:    s_sub_i32 s18, s18, s16
-; GFX8-NEXT:    s_min_i32 s4, s4, s19
-; GFX8-NEXT:    s_sext_i32_i16 s18, s18
+; GFX8-NEXT:    s_max_i32 s16, s4, s17
+; GFX8-NEXT:    s_sub_i32 s16, s16, 0x7fff
+; GFX8-NEXT:    s_min_i32 s4, s4, s17
+; GFX8-NEXT:    s_sext_i32_i16 s16, s16
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
-; GFX8-NEXT:    s_sub_i32 s4, s4, s17
-; GFX8-NEXT:    s_max_i32 s12, s18, s12
+; GFX8-NEXT:    s_sub_i32 s4, s4, 0xffff8000
+; GFX8-NEXT:    s_max_i32 s12, s16, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_min_i32 s4, s12, s4
 ; GFX8-NEXT:    s_sub_i32 s4, s8, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s1
-; GFX8-NEXT:    s_max_i32 s12, s8, s19
-; GFX8-NEXT:    s_sub_i32 s12, s12, s16
+; GFX8-NEXT:    s_max_i32 s12, s8, s17
+; GFX8-NEXT:    s_sub_i32 s12, s12, 0x7fff
 ; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
-; GFX8-NEXT:    s_min_i32 s8, s8, s19
+; GFX8-NEXT:    s_min_i32 s8, s8, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_sub_i32 s8, s8, s17
+; GFX8-NEXT:    s_sub_i32 s8, s8, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s5, s12, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
@@ -3993,25 +3927,25 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_min_i32 s5, s5, s8
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s9
-; GFX8-NEXT:    s_max_i32 s8, s5, s19
-; GFX8-NEXT:    s_sub_i32 s8, s8, s16
-; GFX8-NEXT:    s_min_i32 s5, s5, s19
+; GFX8-NEXT:    s_max_i32 s8, s5, s17
+; GFX8-NEXT:    s_sub_i32 s8, s8, 0x7fff
+; GFX8-NEXT:    s_min_i32 s5, s5, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s13
-; GFX8-NEXT:    s_sub_i32 s5, s5, s17
+; GFX8-NEXT:    s_sub_i32 s5, s5, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s8, s8, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_min_i32 s5, s8, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s2
 ; GFX8-NEXT:    s_sub_i32 s5, s9, s5
-; GFX8-NEXT:    s_max_i32 s9, s8, s19
-; GFX8-NEXT:    s_sub_i32 s9, s9, s16
+; GFX8-NEXT:    s_max_i32 s9, s8, s17
+; GFX8-NEXT:    s_sub_i32 s9, s9, 0x7fff
 ; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
-; GFX8-NEXT:    s_min_i32 s8, s8, s19
+; GFX8-NEXT:    s_min_i32 s8, s8, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_sub_i32 s8, s8, s17
+; GFX8-NEXT:    s_sub_i32 s8, s8, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s6, s9, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
@@ -4019,24 +3953,24 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_min_i32 s6, s6, s8
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s10
-; GFX8-NEXT:    s_max_i32 s8, s6, s19
-; GFX8-NEXT:    s_sub_i32 s8, s8, s16
-; GFX8-NEXT:    s_min_i32 s6, s6, s19
+; GFX8-NEXT:    s_max_i32 s8, s6, s17
+; GFX8-NEXT:    s_sub_i32 s8, s8, 0x7fff
+; GFX8-NEXT:    s_min_i32 s6, s6, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s14
-; GFX8-NEXT:    s_sub_i32 s6, s6, s17
+; GFX8-NEXT:    s_sub_i32 s6, s6, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s8, s8, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_min_i32 s6, s8, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s3
-; GFX8-NEXT:    s_max_i32 s9, s8, s19
-; GFX8-NEXT:    s_sub_i32 s9, s9, s16
+; GFX8-NEXT:    s_max_i32 s9, s8, s17
+; GFX8-NEXT:    s_sub_i32 s9, s9, 0x7fff
 ; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
-; GFX8-NEXT:    s_min_i32 s8, s8, s19
+; GFX8-NEXT:    s_min_i32 s8, s8, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
-; GFX8-NEXT:    s_sub_i32 s8, s8, s17
+; GFX8-NEXT:    s_sub_i32 s8, s8, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s7, s9, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
@@ -4044,15 +3978,15 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_min_i32 s7, s7, s8
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s11
-; GFX8-NEXT:    s_max_i32 s8, s7, s19
-; GFX8-NEXT:    s_sub_i32 s8, s8, s16
+; GFX8-NEXT:    s_max_i32 s8, s7, s17
+; GFX8-NEXT:    s_sub_i32 s8, s8, 0x7fff
 ; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
-; GFX8-NEXT:    s_min_i32 s7, s7, s19
+; GFX8-NEXT:    s_min_i32 s7, s7, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s15
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX8-NEXT:    s_sub_i32 s7, s7, s17
+; GFX8-NEXT:    s_sub_i32 s7, s7, 0xffff8000
 ; GFX8-NEXT:    s_max_i32 s8, s8, s9
 ; GFX8-NEXT:    s_or_b32 s0, s0, s4
 ; GFX8-NEXT:    s_bfe_u32 s4, s5, 0x100000
@@ -4539,13 +4473,12 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX6-NEXT:    s_mov_b32 s10, 0
+; GFX6-NEXT:    s_mov_b32 s5, 0
 ; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s8
-; GFX6-NEXT:    s_addc_u32 s1, s4, s5
+; GFX6-NEXT:    s_addc_u32 s1, s4, 0x80000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
 ; GFX6-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
@@ -4557,10 +4490,10 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, 0
 ; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX6-NEXT:    s_addc_u32 s3, s4, s5
+; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX6-NEXT:    s_addc_u32 s3, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
@@ -4582,13 +4515,12 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX8-NEXT:    s_mov_b32 s10, 0
+; GFX8-NEXT:    s_mov_b32 s5, 0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s8
-; GFX8-NEXT:    s_addc_u32 s1, s4, s5
+; GFX8-NEXT:    s_addc_u32 s1, s4, 0x80000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
 ; GFX8-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
@@ -4600,10 +4532,10 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX8-NEXT:    s_mov_b32 s6, 0
+; GFX8-NEXT:    s_mov_b32 s5, 0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX8-NEXT:    s_addc_u32 s3, s4, s5
+; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX8-NEXT:    s_addc_u32 s3, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
@@ -4625,13 +4557,12 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX9-NEXT:    s_mov_b32 s10, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0
 ; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX9-NEXT:    s_brev_b32 s5, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-NEXT:    s_addc_u32 s1, s4, s5
+; GFX9-NEXT:    s_addc_u32 s1, s4, 0x80000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
 ; GFX9-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -4643,10 +4574,10 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX9-NEXT:    s_mov_b32 s6, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0
 ; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX9-NEXT:    s_addc_u32 s3, s4, s5
+; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX9-NEXT:    s_addc_u32 s3, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
@@ -4665,15 +4596,14 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, s[4:5], 0
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[8:9], s[0:1]
-; GFX10-NEXT:    s_mov_b32 s11, 0
+; GFX10-NEXT:    s_mov_b32 s10, 0
 ; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-NEXT:    s_brev_b32 s10, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10-NEXT:    s_xor_b32 s8, s4, s1
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s8
-; GFX10-NEXT:    s_addc_u32 s1, s0, s10
+; GFX10-NEXT:    s_addc_u32 s1, s0, 0x80000000
 ; GFX10-NEXT:    s_sub_u32 s4, s2, s6
 ; GFX10-NEXT:    s_subb_u32 s5, s3, s7
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s4
@@ -4683,9 +4613,9 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s8
 ; GFX10-NEXT:    s_xor_b32 s2, s3, s2
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s0, s2
-; GFX10-NEXT:    s_addc_u32 s1, s0, s10
+; GFX10-NEXT:    s_addc_u32 s1, s0, 0x80000000
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s1, s2
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
@@ -5481,10 +5411,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_addc_u32 s1, s0, 0
-; GFX6-NEXT:    s_brev_b32 s8, 1
 ; GFX6-NEXT:    s_addc_u32 s2, s0, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_addc_u32 s3, s0, s8
+; GFX6-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s16
@@ -5525,7 +5454,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_addc_u32 s5, s4, 0
 ; GFX6-NEXT:    s_addc_u32 s6, s4, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_addc_u32 s7, s4, s8
+; GFX6-NEXT:    s_addc_u32 s7, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s0
@@ -5582,10 +5511,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_addc_u32 s1, s0, 0
-; GFX8-NEXT:    s_brev_b32 s8, 1
 ; GFX8-NEXT:    s_addc_u32 s2, s0, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_addc_u32 s3, s0, s8
+; GFX8-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s16
@@ -5632,7 +5560,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_addc_u32 s5, s4, 0
 ; GFX8-NEXT:    s_addc_u32 s6, s4, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_addc_u32 s7, s4, s8
+; GFX8-NEXT:    s_addc_u32 s7, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
@@ -5689,10 +5617,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_addc_u32 s1, s0, 0
-; GFX9-NEXT:    s_brev_b32 s8, 1
 ; GFX9-NEXT:    s_addc_u32 s2, s0, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_addc_u32 s3, s0, s8
+; GFX9-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s16
@@ -5739,7 +5666,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_addc_u32 s5, s4, 0
 ; GFX9-NEXT:    s_addc_u32 s6, s4, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_addc_u32 s7, s4, s8
+; GFX9-NEXT:    s_addc_u32 s7, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
@@ -5770,7 +5697,6 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_subb_u32 s18, s2, s10
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
 ; GFX10-NEXT:    s_subb_u32 s19, s3, s11
-; GFX10-NEXT:    s_brev_b32 s21, 1
 ; GFX10-NEXT:    s_cmp_eq_u64 s[18:19], s[2:3]
 ; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -5792,7 +5718,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
 ; GFX10-NEXT:    s_addc_u32 s2, s0, 0
-; GFX10-NEXT:    s_addc_u32 s3, s0, s21
+; GFX10-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX10-NEXT:    s_sub_u32 s8, s4, s12
 ; GFX10-NEXT:    s_subb_u32 s9, s5, s13
 ; GFX10-NEXT:    s_subb_u32 s10, s6, s14
@@ -5838,7 +5764,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX10-NEXT:    s_addc_u32 s2, s0, 0
-; GFX10-NEXT:    s_addc_u32 s3, s0, s21
+; GFX10-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, s1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s2, vcc_lo

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index 913cb7306483e..20cd35c1d4692 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -113,44 +113,43 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x4
 ; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    s_mov_b32 s1, 0x80008
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s3, s4, s1
+; GFX7-NEXT:    s_bfe_u32 s2, s4, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    s_lshr_b32 s2, s4, 16
+; GFX7-NEXT:    s_lshr_b32 s1, s4, 16
 ; GFX7-NEXT:    ds_write_b8 v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:1
 ; GFX7-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:3
-; GFX7-NEXT:    s_bfe_u32 s2, s5, s1
+; GFX7-NEXT:    s_bfe_u32 s1, s5, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX7-NEXT:    s_lshr_b32 s0, s5, 16
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:4
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:5
-; GFX7-NEXT:    s_lshr_b32 s2, s5, 24
+; GFX7-NEXT:    s_lshr_b32 s1, s5, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:6
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:7
-; GFX7-NEXT:    s_bfe_u32 s2, s6, s1
+; GFX7-NEXT:    s_bfe_u32 s1, s6, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7-NEXT:    s_lshr_b32 s0, s6, 16
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:8
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:9
-; GFX7-NEXT:    s_lshr_b32 s2, s6, 24
+; GFX7-NEXT:    s_lshr_b32 s1, s6, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:10
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:11
-; GFX7-NEXT:    s_bfe_u32 s1, s7, s1
+; GFX7-NEXT:    s_bfe_u32 s1, s7, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX7-NEXT:    s_lshr_b32 s0, s7, 16
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:12

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
index d19684bcbff9f..7777b84030362 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
@@ -98,33 +98,32 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x4
 ; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    s_mov_b32 s1, 0x80008
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s3, s4, s1
+; GFX7-NEXT:    s_bfe_u32 s2, s4, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    s_lshr_b32 s2, s4, 16
+; GFX7-NEXT:    s_lshr_b32 s1, s4, 16
 ; GFX7-NEXT:    ds_write_b8 v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:1
 ; GFX7-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:3
-; GFX7-NEXT:    s_bfe_u32 s2, s5, s1
+; GFX7-NEXT:    s_bfe_u32 s1, s5, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX7-NEXT:    s_lshr_b32 s0, s5, 16
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:4
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:5
-; GFX7-NEXT:    s_lshr_b32 s2, s5, 24
+; GFX7-NEXT:    s_lshr_b32 s1, s5, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:6
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:7
-; GFX7-NEXT:    s_bfe_u32 s1, s6, s1
+; GFX7-NEXT:    s_bfe_u32 s1, s6, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7-NEXT:    s_lshr_b32 s0, s6, 16
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:8

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
index ec59cb495898d..59d9ef9ec4257 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
@@ -967,7 +967,7 @@ define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) {
 ; GFX7-LABEL: usubo_i16_sv:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_mov_b32 s1, 0xffff
-; GFX7-NEXT:    s_and_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    v_and_b32_e32 v0, s1, v0
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, s1, v0
@@ -980,7 +980,7 @@ define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) {
 ; GFX8-LABEL: usubo_i16_sv:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_mov_b32 s1, 0xffff
-; GFX8-NEXT:    s_and_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    v_and_b32_e32 v0, s1, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, s1, v0
@@ -992,8 +992,7 @@ define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) {
 ;
 ; GFX9-LABEL: usubo_i16_sv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s1, 0xffff
-; GFX9-NEXT:    s_and_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX9-NEXT:    v_sub_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    v_cmp_ne_u32_sdwa s[0:1], v0, v0 src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
index ae5c8f9a4ade0..fc67a417c709b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
@@ -162,23 +162,21 @@ define <2 x i32> @v_trunc_v4i32_to_v4i16(<4 x i32> %src) {
 define amdgpu_ps <2 x i32> @s_trunc_v4i32_to_v4i16(<4 x i32> inreg %src) {
 ; GFX7-LABEL: s_trunc_v4i32_to_v4i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX7-NEXT:    s_and_b32 s0, s0, s4
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_or_b32 s0, s1, s0
 ; GFX7-NEXT:    s_lshl_b32 s1, s3, 16
-; GFX7-NEXT:    s_and_b32 s2, s2, s4
+; GFX7-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_trunc_v4i32_to_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s4
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s3, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, s4
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    ; return to shader part epilog
   %trunc = trunc <4 x i32> %src to <4 x i16>

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 24284351fc911..5134da030ec2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -242,12 +242,11 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v4
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
@@ -305,17 +304,16 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-LABEL: s_uaddsat_v2i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX9-NEXT:    s_mov_b32 s2, 0x80008
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
-; GFX9-NEXT:    s_lshl_b32 s2, s3, 8
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0 clamp
@@ -332,15 +330,14 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT:    s_mov_b32 s2, 0x80008
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
-; GFX10-NEXT:    s_lshl_b32 s2, s4, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
 ; GFX10-NEXT:    v_pk_add_u16 v0, s0, s1 clamp
 ; GFX10-NEXT:    s_movk_i32 s0, 0xff
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
@@ -466,35 +463,33 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v7
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_and_or_b32 v3, v6, v2, v4
-; GFX10-NEXT:    v_and_or_b32 v2, v8, v2, v5
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v6
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v5, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v7, v4
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 24
-; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
-; GFX10-NEXT:    v_pk_add_u16 v1, v3, v2 clamp
+; GFX10-NEXT:    v_pk_add_u16 v1, v2, v3 clamp
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_b32_e32 v3, s4, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v2
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -590,27 +585,26 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
-; GFX9-NEXT:    s_mov_b32 s4, 0x80008
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshl_b32 s3, s3, s4
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s8, s9
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 0x80008
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s8, s9
-; GFX9-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
-; GFX9-NEXT:    s_lshl_b32 s4, s6, s4
-; GFX9-NEXT:    s_lshl_b32 s6, s7, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0 clamp
@@ -637,39 +631,37 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
-; GFX10-NEXT:    s_mov_b32 s3, 0x80008
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 0x80008
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s6, s7
-; GFX10-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX10-NEXT:    s_lshl_b32 s1, s1, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 0x80008
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
-; GFX10-NEXT:    s_lshl_b32 s4, s6, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
 ; GFX10-NEXT:    v_pk_add_u16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_add_u16 v1, s2, s3 clamp
 ; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    s_movk_i32 s1, 0xff
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_b32_e32 v3, s1, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, 24
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v2
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 12423fc70269d..0f1b97e9d3ed8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -910,9 +910,8 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX10-LABEL: udivrem_v4i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0x4f7ffffe
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s12
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s13
@@ -926,9 +925,9 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX10-NEXT:    s_sub_i32 s1, 0, s13
 ; GFX10-NEXT:    s_sub_i32 s2, 0, s14
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v4
-; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v4
-; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
@@ -2269,24 +2268,24 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-LABEL: udivrem_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX8-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s3, s1, s2
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX8-NEXT:    s_lshr_b32 s8, s1, 16
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX8-NEXT:    s_sub_i32 s1, 0, s3
+; GFX8-NEXT:    s_and_b32 s2, s1, 0xffff
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX8-NEXT:    s_sub_i32 s1, 0, s2
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_lshr_b32 s9, s0, 16
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s1, v0
-; GFX8-NEXT:    s_sub_i32 s1, 0, s8
+; GFX8-NEXT:    s_sub_i32 s1, 0, s3
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
@@ -2294,34 +2293,34 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s3
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s3, v2
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s8
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v2
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s9, v3
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s8, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s8, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, s2, v1
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, s8, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v0, s2, v3
+; GFX8-NEXT:    v_and_b32_e32 v0, s8, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
@@ -2335,54 +2334,53 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX9-LABEL: udivrem_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s7, s1, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT:    s_sub_i32 s1, 0, s7
+; GFX9-NEXT:    s_and_b32 s3, s1, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GFX9-NEXT:    s_sub_i32 s1, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s3, 0, s6
+; GFX9-NEXT:    s_sub_i32 s6, 0, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_and_b32 s9, s0, s2
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_lshr_b32 s8, s0, 16
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s1, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v1
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s8, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s7
+; GFX9-NEXT:    v_mul_hi_u32 v1, s1, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s3
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s6
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s2
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, s9, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, s0, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, s1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s2, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2391,43 +2389,42 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
-; GFX9-NEXT:    global_store_dword v2, v1, s[2:3]
+; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v2, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_v2i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
-; GFX10-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_lshr_b32 s2, s1, 16
-; GFX10-NEXT:    s_and_b32 s1, s1, s3
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s1
-; GFX10-NEXT:    s_sub_i32 s6, 0, s2
+; GFX10-NEXT:    s_sub_i32 s3, 0, s2
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, s6, v0
-; GFX10-NEXT:    s_sub_i32 s6, 0, s1
-; GFX10-NEXT:    v_mul_lo_u32 v3, s6, v1
-; GFX10-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX10-NEXT:    s_and_b32 s0, s0, s3
+; GFX10-NEXT:    v_mul_lo_u32 v2, s3, v0
+; GFX10-NEXT:    s_sub_i32 s3, 0, s1
+; GFX10-NEXT:    v_mul_lo_u32 v3, s3, v1
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
-; GFX10-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, s3, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s0, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s6, v2
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s3, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s2, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
@@ -2445,14 +2442,13 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_and_or_b32 v0, v1, v4, v0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_and_or_b32 v2, v3, v4, v2
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v3, v2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    global_store_dword v1, v2, s[6:7]
@@ -2586,9 +2582,8 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX8-LABEL: udivrem_i27:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX8-NEXT:    s_mov_b32 s8, 0x7ffffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s7, s7, s8
+; GFX8-NEXT:    s_and_b32 s7, s7, 0x7ffffff
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s7
 ; GFX8-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -2596,7 +2591,8 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v1, s0, v0
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX8-NEXT:    s_and_b32 s4, s6, s8
+; GFX8-NEXT:    s_and_b32 s4, s6, 0x7ffffff
+; GFX8-NEXT:    s_mov_b32 s5, 0x7ffffff
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v2, s4, v0
@@ -2614,11 +2610,11 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s7, v3
-; GFX8-NEXT:    v_and_b32_e32 v2, s8, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, s5, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_and_b32_e32 v2, s8, v3
+; GFX8-NEXT:    v_and_b32_e32 v2, s5, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -2626,49 +2622,48 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX9-LABEL: udivrem_i27:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
-; GFX9-NEXT:    s_mov_b32 s6, 0x7ffffff
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s7, s1, s6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_sub_i32 s1, 0, s7
-; GFX9-NEXT:    s_and_b32 s8, s0, s6
+; GFX9-NEXT:    s_and_b32 s6, s1, 0x7ffffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT:    s_sub_i32 s1, 0, s6
+; GFX9-NEXT:    s_and_b32 s7, s0, 0x7ffffff
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v0
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s4, 0x7ffffff
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s7
+; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
+; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s6
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, s8, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
+; GFX9-NEXT:    v_sub_u32_e32 v1, s7, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s6, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s6, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, s6, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v0, s6, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, s4, v1
 ; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_i27:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
-; GFX10-NEXT:    s_mov_b32 s6, 0x7ffffff
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_and_b32 s7, s1, s6
-; GFX10-NEXT:    s_and_b32 s0, s0, s6
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX10-NEXT:    s_sub_i32 s1, 0, s7
+; GFX10-NEXT:    s_and_b32 s6, s1, 0x7ffffff
+; GFX10-NEXT:    s_and_b32 s0, s0, 0x7ffffff
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX10-NEXT:    s_sub_i32 s1, 0, s6
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -2676,22 +2671,22 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s7
+; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s6
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s7, v1
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s6, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s6, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s7, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s6, v1
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s6, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_and_b32_e32 v0, s6, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, s6, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7ffffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0x7ffffff, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-NEXT:    global_store_dword v2, v1, s[2:3]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
index 738cd237eb017..df31f96f177a1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -272,13 +272,13 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
 ; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v1, v2
+; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v1, v2
+; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 27de0ccd4b23a..a08c9277393ea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -1079,9 +1079,9 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
 ; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[4:5]
 ; CHECK-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; CHECK-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
@@ -1099,67 +1099,67 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s8, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s8
-; GISEL-NEXT:    s_sub_u32 s6, 0, s8
-; GISEL-NEXT:    v_madmk_f32 v5, v4, 0x4f800000, v6
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v7, 0
+; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, s8
 ; GISEL-NEXT:    s_subb_u32 s7, 0, 0
 ; GISEL-NEXT:    s_bfe_i32 s4, -1, 0x10000
 ; GISEL-NEXT:    s_bfe_i32 s5, -1, 0x10000
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v4
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GISEL-NEXT:    v_mov_b32_e32 v5, s4
-; GISEL-NEXT:    v_mov_b32_e32 v4, s5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT:    s_sub_u32 s9, 0, s8
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT:    v_madmk_f32 v9, v7, 0x4f800000, v8
+; GISEL-NEXT:    v_mov_b32_e32 v6, s4
+; GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v7
+; GISEL-NEXT:    s_sub_u32 s9, 0, 0x12d8fb
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v9
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v8
 ; GISEL-NEXT:    s_subb_u32 s10, 0, 0
-; GISEL-NEXT:    s_bfe_i32 s4, -1, 0x10000
 ; GISEL-NEXT:    s_bfe_i32 s11, -1, 0x10000
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
-; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v6
-; GISEL-NEXT:    v_mov_b32_e32 v10, s4
-; GISEL-NEXT:    v_trunc_f32_e32 v8, v8
+; GISEL-NEXT:    s_bfe_i32 s12, -1, 0x10000
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
+; GISEL-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
+; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v7
+; GISEL-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v8
 ; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
-; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v9
+; GISEL-NEXT:    v_trunc_f32_e32 v10, v10
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v9
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; GISEL-NEXT:    v_mac_f32_e32 v8, 0xcf800000, v10
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_mul_lo_u32 v11, s6, v8
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, s9, v9
+; GISEL-NEXT:    v_mul_lo_u32 v11, s6, v9
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT:    v_mul_lo_u32 v12, s9, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v13, s6, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v14, s7, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v15, s6, v7
-; GISEL-NEXT:    v_mul_lo_u32 v16, s9, v6
-; GISEL-NEXT:    v_mul_lo_u32 v17, s10, v6
-; GISEL-NEXT:    v_mul_hi_u32 v18, s9, v6
+; GISEL-NEXT:    v_mul_lo_u32 v16, s9, v8
+; GISEL-NEXT:    v_mul_lo_u32 v17, s10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v18, s9, v8
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v13
+; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v19, v7, v13
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
-; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v16
+; GISEL-NEXT:    v_mul_lo_u32 v17, v10, v16
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v16
-; GISEL-NEXT:    v_mul_hi_u32 v16, v9, v16
+; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v16
+; GISEL-NEXT:    v_mul_hi_u32 v16, v10, v16
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
-; GISEL-NEXT:    v_mul_lo_u32 v18, v6, v12
+; GISEL-NEXT:    v_mul_lo_u32 v18, v8, v12
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v11
-; GISEL-NEXT:    v_mul_lo_u32 v17, v8, v11
+; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v11
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
 ; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v9, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v19
-; GISEL-NEXT:    v_mul_lo_u32 v19, v9, v12
+; GISEL-NEXT:    v_mul_lo_u32 v19, v10, v12
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v17, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
@@ -1167,8 +1167,8 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v17, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
-; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT:    v_mul_hi_u32 v18, v8, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v19, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
@@ -1183,40 +1183,40 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v11, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v11, s6, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v13, s7, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v14, s6, v7
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v16
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, s9, v6
-; GISEL-NEXT:    v_mul_lo_u32 v15, s10, v6
-; GISEL-NEXT:    v_mul_hi_u32 v16, s9, v6
-; GISEL-NEXT:    v_mul_lo_u32 v17, s6, v8
-; GISEL-NEXT:    v_mul_lo_u32 v18, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v16
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v10, v12, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, s9, v8
+; GISEL-NEXT:    v_mul_lo_u32 v15, s10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v16, s9, v8
+; GISEL-NEXT:    v_mul_lo_u32 v17, s6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v18, v9, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v19, v7, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v9, v11
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
-; GISEL-NEXT:    v_mul_lo_u32 v17, s9, v9
+; GISEL-NEXT:    v_mul_lo_u32 v17, s9, v10
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
-; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v12
+; GISEL-NEXT:    v_mul_lo_u32 v17, v10, v12
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v15
+; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v15
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v7, v13
-; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v13
+; GISEL-NEXT:    v_mul_lo_u32 v16, v9, v13
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v18, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
 ; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v18, s[4:5], v18, v19
-; GISEL-NEXT:    v_mul_lo_u32 v19, v9, v15
+; GISEL-NEXT:    v_mul_lo_u32 v19, v10, v15
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v16, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
@@ -1224,125 +1224,126 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; GISEL-NEXT:    v_mul_hi_u32 v17, v6, v15
+; GISEL-NEXT:    v_mul_hi_u32 v17, v8, v15
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v19, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
 ; GISEL-NEXT:    v_mov_b32_e32 v19, s11
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v13
-; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v15
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v18
+; GISEL-NEXT:    v_mov_b32_e32 v18, s12
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v15
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v18
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v16
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v14, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
-; GISEL-NEXT:    v_mul_lo_u32 v15, v0, v8
-; GISEL-NEXT:    v_mul_lo_u32 v16, v1, v8
-; GISEL-NEXT:    v_mul_hi_u32 v17, v0, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v1, v8
-; GISEL-NEXT:    v_mul_lo_u32 v18, v2, v9
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_mul_lo_u32 v12, v3, v9
-; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v9
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v16, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v12, v6
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v10, v14, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v3, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v3, v8
+; GISEL-NEXT:    v_mul_lo_u32 v15, v0, v9
+; GISEL-NEXT:    v_mul_lo_u32 v16, v1, v9
+; GISEL-NEXT:    v_mul_hi_u32 v17, v0, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_mul_lo_u32 v11, v2, v10
+; GISEL-NEXT:    v_mul_lo_u32 v13, v3, v10
+; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
+; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v3, v10
+; GISEL-NEXT:    v_add_i32_e64 v7, s[6:7], v16, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[6:7]
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v13, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v15, v11
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v18, v17
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT:    v_mul_lo_u32 v13, s8, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_mul_lo_u32 v14, s8, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v15, 0, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v7, s8, v7
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_mul_lo_u32 v14, s8, v6
-; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, s8, v6
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v8
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_mul_lo_u32 v12, s8, v8
+; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, s8, v8
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v15, v8
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v16, v9
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v1, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v10
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v16, v10
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v14
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v1, v7, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v7
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v0
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v14
-; GISEL-NEXT:    v_subb_u32_e64 v9, s[6:7], v3, v6, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v6
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[6:7], s8, v2
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[6:7]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[6:7]
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v12
+; GISEL-NEXT:    v_subb_u32_e64 v10, s[6:7], v3, v8, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v8
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v2, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[6:7]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v19, v8, vcc
 ; GISEL-NEXT:    v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5]
-; GISEL-NEXT:    v_subrev_i32_e32 v7, vcc, s8, v0
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v0, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v11, vcc, s8, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, v2, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v12, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v10, vcc, s8, v7
-; GISEL-NEXT:    v_subbrev_u32_e32 v13, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v8, v4
+; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, v19, v12, vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v14, vcc, s8, v11
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v18, v13, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v12, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v7, v10, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, v11, v14, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v8, v11, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[4:5]
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v10, v3, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i64_oddk_denom:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 3a1566b63e501..fe46feea4ebbf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -236,12 +236,11 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v4
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
@@ -297,17 +296,16 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-LABEL: s_usubsat_v2i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX9-NEXT:    s_mov_b32 s2, 0x80008
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
-; GFX9-NEXT:    s_lshl_b32 s2, s3, 8
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
@@ -324,15 +322,14 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT:    s_mov_b32 s2, 0x80008
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
-; GFX10-NEXT:    s_lshl_b32 s2, s4, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
 ; GFX10-NEXT:    v_pk_sub_u16 v0, s0, s1 clamp
 ; GFX10-NEXT:    s_movk_i32 s0, 0xff
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
@@ -454,35 +451,33 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v7
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_and_or_b32 v3, v6, v2, v4
-; GFX10-NEXT:    v_and_or_b32 v2, v8, v2, v5
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v6
+; GFX10-NEXT:    v_and_or_b32 v2, 0xffff, v5, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v7, v4
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 24
-; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
-; GFX10-NEXT:    v_pk_sub_u16 v1, v3, v2 clamp
+; GFX10-NEXT:    v_pk_sub_u16 v1, v2, v3 clamp
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_b32_e32 v3, s4, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v2
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -574,27 +569,26 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
-; GFX9-NEXT:    s_mov_b32 s4, 0x80008
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshl_b32 s3, s3, s4
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s8, s9
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 0x80008
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s8, s9
-; GFX9-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
-; GFX9-NEXT:    s_lshl_b32 s4, s6, s4
-; GFX9-NEXT:    s_lshl_b32 s6, s7, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
@@ -621,39 +615,37 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
-; GFX10-NEXT:    s_mov_b32 s3, 0x80008
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 0x80008
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s6, s7
-; GFX10-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX10-NEXT:    s_lshl_b32 s1, s1, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 0x80008
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
-; GFX10-NEXT:    s_lshl_b32 s4, s6, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
 ; GFX10-NEXT:    v_pk_sub_u16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_sub_u16 v1, s2, s3 clamp
 ; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    s_movk_i32 s1, 0xff
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_b32_e32 v3, s1, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, 24
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v2
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
index 89b3900dc2880..c1a44cef4aa20 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -25,12 +25,11 @@ entry:
 define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) {
 ; GFX7-LABEL: scalar_xnor_v2i16_one_use:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX7-NEXT:    s_and_b32 s0, s0, s4
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_or_b32 s0, s1, s0
 ; GFX7-NEXT:    s_lshl_b32 s1, s3, 16
-; GFX7-NEXT:    s_and_b32 s2, s2, s4
+; GFX7-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
 ; GFX7-NEXT:    s_xor_b32 s0, s0, s1
 ; GFX7-NEXT:    s_xor_b32 s0, s0, -1
@@ -42,10 +41,10 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in
 ; GFX8-NEXT:    s_xor_b32 s0, s0, s1
 ; GFX8-NEXT:    s_mov_b32 s3, s2
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -117,18 +116,17 @@ define amdgpu_ps i64 @scalar_xnor_i64_one_use(i64 inreg %a, i64 inreg %b) {
 define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> inreg %b) {
 ; GFX7-LABEL: scalar_xnor_v4i16_one_use:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX7-NEXT:    s_and_b32 s0, s0, s8
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_or_b32 s0, s1, s0
 ; GFX7-NEXT:    s_lshl_b32 s1, s3, 16
-; GFX7-NEXT:    s_and_b32 s2, s2, s8
+; GFX7-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
 ; GFX7-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX7-NEXT:    s_and_b32 s3, s4, s8
+; GFX7-NEXT:    s_and_b32 s3, s4, 0xffff
 ; GFX7-NEXT:    s_or_b32 s2, s2, s3
 ; GFX7-NEXT:    s_lshl_b32 s3, s7, 16
-; GFX7-NEXT:    s_and_b32 s4, s6, s8
+; GFX7-NEXT:    s_and_b32 s4, s6, 0xffff
 ; GFX7-NEXT:    s_or_b32 s3, s3, s4
 ; GFX7-NEXT:    s_mov_b32 s4, -1
 ; GFX7-NEXT:    s_mov_b32 s5, s4
@@ -142,16 +140,16 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in
 ; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX8-NEXT:    s_mov_b32 s5, s4
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX8-NEXT:    s_and_b32 s2, s0, s4
+; GFX8-NEXT:    s_and_b32 s2, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
-; GFX8-NEXT:    s_and_b32 s6, s1, s4
+; GFX8-NEXT:    s_and_b32 s6, s1, 0xffff
 ; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[4:5]
 ; GFX8-NEXT:    s_xor_b64 s[2:3], s[6:7], s[4:5]
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, s4
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s1, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s3, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, s4
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    ; return to shader part epilog
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 5b35f592dd272..f88ce8b36825c 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -205,7 +205,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
 ; GFX9PLUS: global_load_dword [[B:v[0-9]+]]
 
 ; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
-; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]]
+; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]]
+; GFX10-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
 ; GFX9PLUS-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9PLUS: buffer_store_dwordx4
 

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 1bd1a22bf5a9a..a936bcf4ef31e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -150,7 +150,7 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
 ; GCN-NEXT:    v_mul_hi_u32 v2, v2, s4
 ; GCN-NEXT:    v_mul_lo_u32 v3, v2, v0
 ; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
+; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0xf4240, v3
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v0
@@ -232,7 +232,7 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
 ; GCN-NEXT:    v_mul_hi_u32 v2, v2, s4
 ; GCN-NEXT:    v_mul_lo_u32 v3, v2, v0
 ; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
+; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0xf4240, v3
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 9f1e3f9e39227..6b6734efae924 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -2526,12 +2526,12 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s2, s6, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    s_and_b32 s9, s4, s8
+; GFX6-NEXT:    s_and_b32 s8, s6, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT:    s_and_b32 s9, s4, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
@@ -2546,11 +2546,11 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    s_and_b32 s4, s7, s8
+; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT:    s_and_b32 s4, s5, s8
+; GFX6-NEXT:    s_and_b32 s4, s5, 0xffff
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
@@ -2566,7 +2566,7 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v6, v7
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
@@ -2576,8 +2576,8 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_and_b32_e32 v1, s8, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2586,60 +2586,59 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-LABEL: udiv_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s8, s6, s0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
-; GFX9-NEXT:    s_and_b32 s1, s4, s0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
-; GFX9-NEXT:    v_mul_f32_e32 v4, v1, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_mad_f32 v1, -v4, v0, v1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v6
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    s_and_b32 s1, s7, s0
+; GFX9-NEXT:    s_and_b32 s2, s7, 0xffff
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; GFX9-NEXT:    v_mad_f32 v4, -v1, v3, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX9-NEXT:    s_and_b32 s0, s5, s0
+; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s2
+; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX9-NEXT:    s_and_b32 s2, s5, 0xffff
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v1, v6, v7
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
+; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
+; GFX9-NEXT:    s_lshr_b32 s2, s7, 16
+; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v7
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    s_lshr_b32 s0, s7, 16
-; GFX9-NEXT:    v_mad_f32 v4, -v1, v5, v6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
-; GFX9-NEXT:    s_lshr_b32 s0, s5, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s0
+; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    s_lshr_b32 s2, s5, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s2
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
+; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v8
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v4, v7, v8
-; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v4
-; GFX9-NEXT:    v_mad_f32 v4, -v4, v6, v7
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v1, v5, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, v5, v0
-; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v7
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, v4, v0
+; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <4 x i16> %x, %y
   store <4 x i16> %r, <4 x i16> addrspace(1)* %out
@@ -2743,14 +2742,15 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s2, s6, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    s_and_b32 s8, s6, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s6
+; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    v_alignbit_b32 v4, s7, v4, 16
-; GFX6-NEXT:    s_and_b32 s9, s4, s8
+; GFX6-NEXT:    s_and_b32 s9, s4, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v5, s8, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
@@ -2772,10 +2772,10 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_mad_f32 v1, -v1, v5, v6
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v5
-; GFX6-NEXT:    s_and_b32 s6, s7, s8
+; GFX6-NEXT:    s_and_b32 s6, s7, 0xffff
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
-; GFX6-NEXT:    s_and_b32 s6, s5, s8
+; GFX6-NEXT:    s_and_b32 s6, s5, 0xffff
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
@@ -2801,7 +2801,6 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s4
 ; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -2815,68 +2814,67 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-LABEL: urem_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s8, s6, s0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX9-NEXT:    s_and_b32 s1, s4, s0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX9-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s4
-; GFX9-NEXT:    v_mul_f32_e32 v4, v1, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; GFX9-NEXT:    v_mad_f32 v1, -v4, v0, v1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v6
+; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s8
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    s_and_b32 s8, s7, s0
-; GFX9-NEXT:    v_mad_f32 v4, -v1, v3, v5
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX9-NEXT:    s_and_b32 s3, s7, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s3
+; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
+; GFX9-NEXT:    s_and_b32 s8, s5, 0xffff
+; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s8
-; GFX9-NEXT:    s_and_b32 s0, s5, s0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v3, v6, v7
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v7
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
-; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX9-NEXT:    s_lshr_b32 s6, s7, 16
-; GFX9-NEXT:    v_mad_f32 v4, -v3, v5, v6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s6
+; GFX9-NEXT:    v_mad_f32 v3, -v2, v4, v5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s6
 ; GFX9-NEXT:    s_lshr_b32 s5, s5, 16
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
-; GFX9-NEXT:    v_sub_u32_e32 v0, s1, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v4, v7, v8
-; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v4
-; GFX9-NEXT:    v_mad_f32 v4, -v4, v6, v7
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v6
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s8
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s6
-; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v1
-; GFX9-NEXT:    v_sub_u32_e32 v1, s0, v3
-; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, v4, v0
-; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v8
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
+; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v7
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s3
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s6
+; GFX9-NEXT:    v_sub_u32_e32 v4, s4, v1
+; GFX9-NEXT:    v_sub_u32_e32 v1, s8, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX9-NEXT:    v_and_b32_e32 v1, v3, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
+; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <4 x i16> %x, %y
   store <4 x i16> %r, <4 x i16> addrspace(1)* %out
@@ -3831,45 +3829,44 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s2, s6, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    s_and_b32 s9, s4, s8
+; GFX6-NEXT:    s_and_b32 s9, s6, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GFX6-NEXT:    s_and_b32 s8, s4, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
+; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    s_and_b32 s4, s7, s8
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
-; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
+; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
+; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT:    s_and_b32 s4, s5, s8
+; GFX6-NEXT:    s_and_b32 s4, s5, 0xffff
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v5, v6
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v5
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -3878,47 +3875,46 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX9-LABEL: udiv_v3i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s8, s6, s0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
+; GFX9-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
-; GFX9-NEXT:    s_and_b32 s1, s4, s0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s1
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v1
-; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_mad_f32 v2, -v4, v1, v2
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
-; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v6
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX9-NEXT:    s_and_b32 s1, s7, s0
+; GFX9-NEXT:    s_and_b32 s2, s7, 0xffff
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; GFX9-NEXT:    v_mad_f32 v4, -v2, v3, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX9-NEXT:    s_and_b32 s0, s5, s0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
+; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s2
+; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
+; GFX9-NEXT:    s_and_b32 s2, s5, 0xffff
+; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_mul_f32_e32 v3, v6, v7
-; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v6
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX9-NEXT:    global_store_short v0, v3, s[2:3] offset:4
-; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v7
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX9-NEXT:    v_mad_f32 v2, -v2, v4, v5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    global_store_short v6, v2, s[0:1] offset:4
+; GFX9-NEXT:    global_store_dword v6, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <3 x i16> %x, %y
   store <3 x i16> %r, <3 x i16> addrspace(1)* %out
@@ -4003,9 +3999,9 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s9, s6, s8
+; GFX6-NEXT:    s_and_b32 s9, s6, 0xffff
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX6-NEXT:    s_and_b32 s2, s4, s8
+; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v0
@@ -4025,13 +4021,13 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    s_and_b32 s4, s7, s8
+; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v3, -v4, v5, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX6-NEXT:    s_and_b32 s4, s5, s8
+; GFX6-NEXT:    s_and_b32 s4, s5, 0xffff
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
@@ -4057,51 +4053,51 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX9-LABEL: urem_v3i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s8, s6, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX9-NEXT:    s_and_b32 s3, s4, s2
+; GFX9-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    s_and_b32 s7, s7, s2
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v4
 ; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s7
-; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
-; GFX9-NEXT:    s_and_b32 s2, s5, s2
+; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX9-NEXT:    s_and_b32 s3, s7, 0xffff
+; GFX9-NEXT:    v_mad_f32 v2, -v5, v1, v3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s3
+; GFX9-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s5
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v6
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
-; GFX9-NEXT:    v_mad_f32 v2, -v2, v4, v5
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s8
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX9-NEXT:    v_mad_f32 v2, -v2, v3, v5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s7
-; GFX9-NEXT:    v_sub_u32_e32 v0, s3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s3
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s4, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, s2, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v3, v2, s[0:1] offset:4
 ; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -4611,18 +4607,18 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
-; GFX6-NEXT:    s_and_b32 s9, s0, s3
+; GFX6-NEXT:    s_and_b32 s8, s2, 0x7fff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
+; GFX6-NEXT:    s_and_b32 s9, s0, 0x7fff
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    s_and_b32 s8, s2, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0xf000f
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0xf000f
+; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
+; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
 ; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s2
@@ -4671,8 +4667,8 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX9-NEXT:    s_and_b32 s3, s2, s6
-; GFX9-NEXT:    s_and_b32 s7, s0, s6
+; GFX9-NEXT:    s_and_b32 s3, s2, 0x7fff
+; GFX9-NEXT:    s_and_b32 s7, s0, 0x7fff
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0xf000f
@@ -4800,45 +4796,45 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
-; GFX6-NEXT:    s_and_b32 s10, s0, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
-; GFX6-NEXT:    s_and_b32 s9, s2, s3
+; GFX6-NEXT:    s_and_b32 s9, s2, 0x7fff
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s9
+; GFX6-NEXT:    s_and_b32 s10, s0, 0x7fff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
 ; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
 ; GFX6-NEXT:    s_bfe_u32 s1, s0, 0xf000f
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s1
+; GFX6-NEXT:    s_bfe_u32 s10, s2, 0xf000f
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX6-NEXT:    s_bfe_u32 s10, s2, 0xf000f
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
+; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
+; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
 ; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s2, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mad_f32 v3, -v1, v5, v3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v4
+; GFX6-NEXT:    v_mad_f32 v3, -v1, v5, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 15
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mad_f32 v3, -v3, v4, v7
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 15
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
@@ -4868,8 +4864,8 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX9-NEXT:    s_and_b32 s3, s2, s6
-; GFX9-NEXT:    s_and_b32 s8, s0, s6
+; GFX9-NEXT:    s_and_b32 s3, s2, 0x7fff
+; GFX9-NEXT:    s_and_b32 s8, s0, 0x7fff
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s3
 ; GFX9-NEXT:    s_bfe_u32 s3, s0, 0xf000f
@@ -5675,24 +5671,23 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
-; GFX6-NEXT:    s_movk_i32 s2, 0x1000
 ; GFX6-NEXT:    s_mov_b32 s0, 0x4f7ffffe
 ; GFX6-NEXT:    s_mov_b32 s11, 0xf000
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s3, s2, s6
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX6-NEXT:    s_lshl_b32 s2, s2, s7
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s2
 ; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s6
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s7
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s0, v1
-; GFX6-NEXT:    s_sub_i32 s0, 0, s3
+; GFX6-NEXT:    s_sub_i32 s0, 0, s2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX6-NEXT:    s_sub_i32 s0, 0, s2
+; GFX6-NEXT:    s_sub_i32 s0, 0, s3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
@@ -5700,25 +5695,25 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s3
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s2
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v2
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
@@ -5726,19 +5721,18 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX9-NEXT:    s_movk_i32 s2, 0x1000
+; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s6, s2, s6
+; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s6
+; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s7
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT:    s_lshl_b32 s7, s2, s7
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_sub_i32 s3, 0, s7
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
@@ -5747,26 +5741,26 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s6
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s7
+; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
 ; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
+; GFX9-NEXT:    v_sub_u32_e32 v4, s5, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v5, s6, v3
-; GFX9-NEXT:    v_sub_u32_e32 v4, s5, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v4
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[0:1]
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s7, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v6, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
@@ -5913,12 +5907,11 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT:    s_movk_i32 s6, 0xfff
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s4, s6
-; GFX6-NEXT:    s_and_b32 s5, s5, s6
+; GFX6-NEXT:    s_and_b32 s4, s4, 0xfff
+; GFX6-NEXT:    s_and_b32 s5, s5, 0xfff
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -5928,13 +5921,12 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s2, s0
-; GFX9-NEXT:    s_and_b32 s0, s3, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xfff
+; GFX9-NEXT:    s_and_b32 s1, s3, 0xfff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <2 x i32> %x, <i32 4096, i32 4096>
@@ -6011,27 +6003,26 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
-; GFX6-NEXT:    s_movk_i32 s2, 0x1000
+; GFX6-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s6, s2, s6
+; GFX6-NEXT:    s_lshl_b32 s6, 0x1000, s6
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT:    s_lshl_b32 s7, s2, s7
+; GFX6-NEXT:    s_lshl_b32 s7, 0x1000, s7
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX6-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s2, v1
-; GFX6-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX6-NEXT:    s_sub_i32 s2, 0, s7
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
@@ -6041,12 +6032,12 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -6059,22 +6050,21 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX9-NEXT:    s_movk_i32 s2, 0x1000
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s3, s2, s7
-; GFX9-NEXT:    s_lshl_b32 s2, s2, s6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s6
+; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
 ; GFX9-NEXT:    s_mov_b32 s6, 0x4f7ffffe
-; GFX9-NEXT:    s_sub_i32 s7, 0, s3
+; GFX9-NEXT:    s_sub_i32 s7, 0, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s6, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s6, 0, s2
+; GFX9-NEXT:    s_sub_i32 s6, 0, s3
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
@@ -6084,22 +6074,23 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s2
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s2, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s2, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
@@ -6478,140 +6469,138 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ;
 ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
-; GFX6-NEXT:    s_movk_i32 s2, 0x1000
-; GFX6-NEXT:    s_mov_b32 s12, 0x4f7ffffe
-; GFX6-NEXT:    s_mov_b32 s11, 0xf000
+; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xb
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s13, 0x4f7ffffe
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s3, s2, s6
-; GFX6-NEXT:    s_ashr_i32 s6, s3, 31
-; GFX6-NEXT:    s_add_i32 s3, s3, s6
-; GFX6-NEXT:    s_xor_b32 s3, s3, s6
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX6-NEXT:    s_lshl_b32 s0, s2, s7
-; GFX6-NEXT:    s_sub_i32 s7, 0, s3
-; GFX6-NEXT:    s_ashr_i32 s2, s0, 31
+; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s10
+; GFX6-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX6-NEXT:    s_add_i32 s0, s0, s1
+; GFX6-NEXT:    s_xor_b32 s2, s0, s1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s11
+; GFX6-NEXT:    s_sub_i32 s11, 0, s2
+; GFX6-NEXT:    s_ashr_i32 s10, s0, 31
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_add_i32 s0, s0, s2
-; GFX6-NEXT:    s_ashr_i32 s1, s4, 31
-; GFX6-NEXT:    s_mov_b32 s10, -1
-; GFX6-NEXT:    v_mul_f32_e32 v0, s12, v0
+; GFX6-NEXT:    s_add_i32 s0, s0, s10
+; GFX6-NEXT:    s_ashr_i32 s3, s8, 31
+; GFX6-NEXT:    s_add_i32 s8, s8, s3
+; GFX6-NEXT:    v_mul_f32_e32 v0, s13, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v0
-; GFX6-NEXT:    s_xor_b32 s7, s0, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s7
-; GFX6-NEXT:    s_add_i32 s0, s4, s1
+; GFX6-NEXT:    s_xor_b32 s12, s3, s1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v0
+; GFX6-NEXT:    s_xor_b32 s11, s0, s10
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s11
+; GFX6-NEXT:    s_xor_b32 s0, s8, s3
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    s_xor_b32 s0, s0, s1
+; GFX6-NEXT:    s_sub_i32 s3, 0, s11
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT:    s_xor_b32 s4, s1, s6
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s12, v2
+; GFX6-NEXT:    v_mul_f32_e32 v1, s13, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s3
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s0, v2
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v2
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX6-NEXT:    s_sub_i32 s0, 0, s7
-; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
-; GFX6-NEXT:    s_ashr_i32 s0, s5, 31
-; GFX6-NEXT:    s_add_i32 s1, s5, s0
+; GFX6-NEXT:    v_mul_lo_u32 v3, s3, v1
+; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
+; GFX6-NEXT:    s_add_i32 s1, s9, s0
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
-; GFX6-NEXT:    s_xor_b32 s2, s0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s7
+; GFX6-NEXT:    s_xor_b32 s2, s0, s10
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s11
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v2
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s7, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s11, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_movk_i32 s0, 0x1000
-; GFX9-NEXT:    s_mov_b32 s10, 0x4f7ffffe
+; GFX9-NEXT:    s_mov_b32 s11, 0x4f7ffffe
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s1, s0, s6
-; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
-; GFX9-NEXT:    s_add_i32 s1, s1, s6
-; GFX9-NEXT:    s_xor_b32 s1, s1, s6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s7
-; GFX9-NEXT:    s_ashr_i32 s8, s0, 31
-; GFX9-NEXT:    s_add_i32 s0, s0, s8
+; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s6
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s7
+; GFX9-NEXT:    s_ashr_i32 s9, s6, 31
+; GFX9-NEXT:    s_add_i32 s6, s6, s9
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GFX9-NEXT:    s_sub_i32 s9, 0, s1
-; GFX9-NEXT:    v_mul_f32_e32 v0, s10, v0
+; GFX9-NEXT:    s_xor_b32 s6, s6, s9
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT:    s_sub_i32 s10, 0, s0
+; GFX9-NEXT:    v_mul_f32_e32 v0, s11, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    s_ashr_i32 s7, s4, 31
 ; GFX9-NEXT:    s_add_i32 s4, s4, s7
-; GFX9-NEXT:    v_mul_lo_u32 v3, s9, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, s10, v1
+; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, s11, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    s_xor_b32 s4, s4, s7
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX9-NEXT:    s_sub_i32 s10, 0, s0
-; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
-; GFX9-NEXT:    s_add_i32 s5, s5, s9
+; GFX9-NEXT:    s_sub_i32 s10, 0, s6
+; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
+; GFX9-NEXT:    s_add_i32 s5, s5, s8
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
-; GFX9-NEXT:    s_xor_b32 s6, s7, s6
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s1
+; GFX9-NEXT:    s_xor_b32 s5, s5, s8
+; GFX9-NEXT:    s_xor_b32 s1, s7, s1
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s0
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v4, s4, v4
-; GFX9-NEXT:    s_xor_b32 s4, s5, s9
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v1, s4, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s1, v4
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s0
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s6
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    s_xor_b32 s1, s9, s8
-; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
-; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
+; GFX9-NEXT:    s_xor_b32 s0, s8, s9
+; GFX9-NEXT:    v_xor_b32_e32 v0, s1, v0
+; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s0, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s1, v1
+; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s1, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v1, s0, v1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
@@ -6806,19 +6795,18 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT:    s_movk_i32 s6, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s7, s4, 31
-; GFX6-NEXT:    s_lshr_b32 s7, s7, 20
-; GFX6-NEXT:    s_add_i32 s7, s4, s7
-; GFX6-NEXT:    s_and_b32 s7, s7, s6
-; GFX6-NEXT:    s_sub_i32 s4, s4, s7
+; GFX6-NEXT:    s_ashr_i32 s6, s4, 31
+; GFX6-NEXT:    s_lshr_b32 s6, s6, 20
+; GFX6-NEXT:    s_add_i32 s6, s4, s6
 ; GFX6-NEXT:    s_ashr_i32 s7, s5, 31
-; GFX6-NEXT:    s_lshr_b32 s7, s7, 20
-; GFX6-NEXT:    s_add_i32 s7, s5, s7
-; GFX6-NEXT:    s_and_b32 s6, s7, s6
+; GFX6-NEXT:    s_and_b32 s6, s6, 0xfffff000
+; GFX6-NEXT:    s_sub_i32 s4, s4, s6
+; GFX6-NEXT:    s_lshr_b32 s6, s7, 20
+; GFX6-NEXT:    s_add_i32 s6, s5, s6
+; GFX6-NEXT:    s_and_b32 s6, s6, 0xfffff000
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
@@ -6829,21 +6817,20 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_movk_i32 s0, 0xf000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s1, s2, 31
+; GFX9-NEXT:    s_ashr_i32 s0, s2, 31
+; GFX9-NEXT:    s_ashr_i32 s1, s3, 31
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
-; GFX9-NEXT:    s_add_i32 s1, s2, s1
-; GFX9-NEXT:    s_ashr_i32 s6, s3, 31
-; GFX9-NEXT:    s_and_b32 s1, s1, s0
-; GFX9-NEXT:    s_sub_i32 s1, s2, s1
-; GFX9-NEXT:    s_lshr_b32 s2, s6, 20
-; GFX9-NEXT:    s_add_i32 s2, s3, s2
-; GFX9-NEXT:    s_and_b32 s0, s2, s0
-; GFX9-NEXT:    s_sub_i32 s0, s3, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_add_i32 s0, s2, s0
+; GFX9-NEXT:    s_add_i32 s1, s3, s1
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX9-NEXT:    s_sub_i32 s0, s2, s0
+; GFX9-NEXT:    s_sub_i32 s1, s3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <2 x i32> %x, <i32 4096, i32 4096>
@@ -6936,39 +6923,38 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-LABEL: srem_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
-; GFX6-NEXT:    s_movk_i32 s8, 0x1000
-; GFX6-NEXT:    s_mov_b32 s11, 0x4f7ffffe
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s9, 0x4f7ffffe
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s2, s8, s6
+; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s6
 ; GFX6-NEXT:    s_ashr_i32 s3, s2, 31
 ; GFX6-NEXT:    s_add_i32 s2, s2, s3
 ; GFX6-NEXT:    s_xor_b32 s6, s2, s3
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT:    s_lshl_b32 s7, s8, s7
-; GFX6-NEXT:    s_ashr_i32 s9, s7, 31
-; GFX6-NEXT:    s_add_i32 s7, s7, s9
+; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s7
+; GFX6-NEXT:    s_ashr_i32 s7, s2, 31
+; GFX6-NEXT:    s_add_i32 s2, s2, s7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_sub_i32 s10, 0, s6
-; GFX6-NEXT:    s_xor_b32 s7, s7, s9
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s7
-; GFX6-NEXT:    v_mul_f32_e32 v0, s11, v0
+; GFX6-NEXT:    s_xor_b32 s7, s2, s7
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX6-NEXT:    s_sub_i32 s2, 0, s6
+; GFX6-NEXT:    v_mul_f32_e32 v0, s9, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX6-NEXT:    s_ashr_i32 s8, s4, 31
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT:    s_add_i32 s4, s4, s8
-; GFX6-NEXT:    v_mul_lo_u32 v1, s10, v0
-; GFX6-NEXT:    s_xor_b32 s4, s4, s8
-; GFX6-NEXT:    s_sub_i32 s10, 0, s7
-; GFX6-NEXT:    s_ashr_i32 s9, s5, 31
-; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s11, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
+; GFX6-NEXT:    s_add_i32 s2, s4, s8
+; GFX6-NEXT:    v_mul_f32_e32 v1, s9, v1
+; GFX6-NEXT:    s_xor_b32 s4, s2, s8
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    s_sub_i32 s2, 0, s7
+; GFX6-NEXT:    s_ashr_i32 s9, s5, 31
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
@@ -7000,64 +6986,63 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_movk_i32 s0, 0x1000
-; GFX9-NEXT:    s_mov_b32 s8, 0x4f7ffffe
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_mov_b32 s9, 0x4f7ffffe
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s1, s0, s6
-; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
-; GFX9-NEXT:    s_add_i32 s1, s1, s6
-; GFX9-NEXT:    s_xor_b32 s1, s1, s6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s7
-; GFX9-NEXT:    s_ashr_i32 s6, s0, 31
-; GFX9-NEXT:    s_add_i32 s0, s0, s6
-; GFX9-NEXT:    s_xor_b32 s0, s0, s6
+; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s6
+; GFX9-NEXT:    s_ashr_i32 s6, s3, 31
+; GFX9-NEXT:    s_add_i32 s3, s3, s6
+; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s7
+; GFX9-NEXT:    s_xor_b32 s3, s3, s6
+; GFX9-NEXT:    s_ashr_i32 s7, s2, 31
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_add_i32 s2, s2, s7
+; GFX9-NEXT:    s_xor_b32 s2, s2, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GFX9-NEXT:    s_sub_i32 s7, 0, s1
+; GFX9-NEXT:    s_sub_i32 s8, 0, s3
 ; GFX9-NEXT:    s_ashr_i32 s6, s4, 31
-; GFX9-NEXT:    v_mul_f32_e32 v0, s8, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX9-NEXT:    v_mul_f32_e32 v0, s9, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_add_i32 s4, s4, s6
-; GFX9-NEXT:    s_xor_b32 s4, s4, s6
-; GFX9-NEXT:    v_mul_f32_e32 v1, s8, v1
-; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, s9, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s7, 0, s0
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v0
+; GFX9-NEXT:    s_sub_i32 s8, 0, s2
+; GFX9-NEXT:    s_xor_b32 s4, s4, s6
+; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
+; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    s_ashr_i32 s7, s5, 31
 ; GFX9-NEXT:    s_add_i32 s5, s5, s7
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    s_xor_b32 s5, s5, s7
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s1
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s0
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s2
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s7, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s7, v1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = srem <2 x i32> %x, %shl.y
@@ -7079,36 +7064,35 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_movk_i32 s4, 0xfee0
 ; GFX6-NEXT:    s_mov_b32 s5, 0x68958c89
-; GFX6-NEXT:    v_mov_b32_e32 v7, 0
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX6-NEXT:    s_movk_i32 s8, 0x11f
+; GFX6-NEXT:    s_mov_b32 s9, 0x976a7377
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s5
-; GFX6-NEXT:    s_mov_b32 s9, 0x976a7377
+; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s5
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
-; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
+; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v7, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -7124,7 +7108,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
@@ -7133,7 +7117,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -7149,13 +7133,13 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s9
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
-; GFX6-NEXT:    v_mov_b32_e32 v5, s8
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x11f
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s9
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -7201,14 +7185,13 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    s_movk_i32 s2, 0xfee0
 ; GFX9-NEXT:    s_mov_b32 s3, 0x68958c89
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s3
@@ -7217,16 +7200,16 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
@@ -7240,17 +7223,17 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GFX9-NEXT:    s_mov_b32 s3, 0x976a7377
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
@@ -7265,32 +7248,33 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x11f
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
-; GFX9-NEXT:    v_mov_b32_e32 v5, s2
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v4, s7, v2
 ; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
-; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v5, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s3, v3
+; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s3, v3
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
 ; GFX9-NEXT:    s_movk_i32 s3, 0x11e
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s3, v4
 ; GFX9-NEXT:    s_mov_b32 s6, 0x976a7376
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s6, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s6, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v5, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], 2, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1]
@@ -7305,10 +7289,10 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v6, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i64 %x, 1235195949943
   store i64 %r, i64 addrspace(1)* %out
@@ -7454,9 +7438,9 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_movk_i32 s6, 0xf001
-; GFX6-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
@@ -7467,23 +7451,22 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    s_lshr_b64 s[8:9], s[0:1], 12
 ; GFX6-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
-; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s6
-; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s6
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s6
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
-; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v7, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
+; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -7497,7 +7480,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
@@ -7505,7 +7488,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -7521,7 +7504,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s0
@@ -7563,7 +7546,6 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    s_movk_i32 s2, 0xf001
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
@@ -7575,75 +7557,76 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s2
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v3
+; GFX9-NEXT:    v_mul_hi_u32 v5, v0, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v3
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s2
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s2
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
+; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s2
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v5
+; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GFX9-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], 12
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
+; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s0
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s0
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s0
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
+; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s6, v9
-; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v6, v4, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s0, v9
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_movk_i32 s0, 0xffe
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
@@ -7652,16 +7635,16 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, -1, v6, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v6, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[2:3]
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out
@@ -7736,38 +7719,37 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_movk_i32 s2, 0xfee0
 ; GFX6-NEXT:    s_mov_b32 s3, 0x689e0837
-; GFX6-NEXT:    v_mov_b32_e32 v7, 0
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s12, 0x9761f7c9
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s8, s4
 ; GFX6-NEXT:    s_movk_i32 s4, 0x11f
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
+; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
+; GFX6-NEXT:    s_mov_b32 s12, 0x9761f7c9
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
-; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
+; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GFX6-NEXT:    s_mov_b32 s9, s5
 ; GFX6-NEXT:    s_movk_i32 s5, 0x11e
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v7, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -7783,7 +7765,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
@@ -7791,7 +7773,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -7807,7 +7789,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s4
@@ -7817,7 +7799,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0x11f
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
 ; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
@@ -7857,35 +7839,34 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    s_movk_i32 s2, 0xfee0
 ; GFX9-NEXT:    s_mov_b32 s3, 0x689e0837
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_movk_i32 s8, 0x11f
+; GFX9-NEXT:    s_mov_b32 s9, 0x9761f7c9
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s3
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s3
-; GFX9-NEXT:    s_mov_b32 s9, 0x9761f7c9
+; GFX9-NEXT:    s_mov_b32 s10, 0x9761f7c8
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    s_mov_b32 s10, 0x9761f7c8
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
@@ -7898,16 +7879,16 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
@@ -7924,46 +7905,47 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s8
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s9
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v2, s7, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x11f
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
 ; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s9, v0
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s9, v0
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1]
 ; GFX9-NEXT:    s_movk_i32 s6, 0x11e
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v5
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s10, v4
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s10, v5
 ; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s8, v5
-; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s9, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s8, v6
+; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s9, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i64 %x, 1235195393993
   store i64 %r, i64 addrspace(1)* %out
@@ -8066,13 +8048,12 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s5, 0xfff
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_and_b32 s4, s4, s5
-; GFX6-NEXT:    s_and_b32 s5, s6, s5
 ; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_and_b32 s4, s4, 0xfff
+; GFX6-NEXT:    s_and_b32 s5, s6, 0xfff
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v1
@@ -8083,14 +8064,13 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s4, s0
-; GFX9-NEXT:    s_and_b32 s0, s6, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_and_b32 s0, s4, 0xfff
+; GFX9-NEXT:    s_and_b32 s1, s6, 0xfff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    global_store_dwordx4 v1, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <2 x i64> %x, <i64 4096, i64 4096>
@@ -8174,8 +8154,8 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_mov_b32 s5, 0xffed2705
-; GFX6-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
@@ -8192,12 +8172,12 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s8
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v4
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
@@ -8205,7 +8185,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -8213,23 +8193,22 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s5
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
 ; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fb
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v5, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -8246,7 +8225,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s0
@@ -8291,7 +8270,6 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    s_mov_b32 s2, 0xffed2705
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
@@ -8306,16 +8284,16 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
@@ -8328,18 +8306,18 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    s_add_u32 s0, s6, s2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
@@ -8349,32 +8327,33 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v5, s0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v5, s1, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
 ; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fb
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s3
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s3
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
+; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s0, v9
-; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v6, v4, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s3, v9
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_mov_b32 s0, 0x12d8fa
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
@@ -8383,9 +8362,9 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, -1, v6, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v6, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -8395,7 +8374,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i64 %x, 1235195
   store i64 %r, i64 addrspace(1)* %out
@@ -8501,48 +8480,47 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
-; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
+; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GFX6-NEXT:    v_mul_lo_u32 v7, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v6, v1, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v6, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX6-NEXT:    v_mul_hi_u32 v5, s2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v6, s3, v1
+; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s3, v0
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
 ; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v6, v4, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
@@ -8594,7 +8572,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
 ; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
@@ -8614,114 +8591,115 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
-; GFX9-NEXT:    v_mul_lo_u32 v6, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v0
-; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
-; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v3
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v5
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
-; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v7, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v6, s10, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v0
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v3
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v6
-; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v3
-; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v6
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v5
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
+; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_add_u32 s0, s6, s10
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s11, s10
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s10
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v0
-; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
+; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
+; GFX9-NEXT:    v_mul_hi_u32 v5, s7, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v4, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s9, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s9
-; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v0
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v3
-; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v4
-; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s8, v4
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-NEXT:    v_sub_u32_e32 v4, s7, v2
+; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
+; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s8, v3
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v7, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s7
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[2:3]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = sdiv i64 %x, %shl.y
@@ -8842,10 +8820,9 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    s_add_u32 s0, s2, s10
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s6
@@ -8856,36 +8833,36 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
-; GFX6-NEXT:    v_mul_lo_u32 v7, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v6, v1, v3
+; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v6, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
-; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v6, s1, v1
+; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s1, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s1, v0
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
 ; GFX6-NEXT:    s_movk_i32 s2, 0xfff
 ; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v6, v4, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s2
@@ -8933,95 +8910,95 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    s_movk_i32 s8, 0xf001
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s8
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s8
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s8
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX9-NEXT:    v_mul_hi_u32 v5, v0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v5
-; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX9-NEXT:    s_ashr_i64 s[4:5], s[0:1], 12
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s8
-; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
+; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
+; GFX9-NEXT:    s_add_u32 s0, s6, s8
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v5
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 20
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX9-NEXT:    s_add_u32 s2, s4, s2
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v6, vcc
-; GFX9-NEXT:    s_addc_u32 s3, s5, 0
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
-; GFX9-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v2
-; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    s_add_u32 s6, s6, s4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_mov_b32 s5, s4
-; GFX9-NEXT:    s_addc_u32 s7, s7, s4
+; GFX9-NEXT:    s_mov_b32 s9, s8
+; GFX9-NEXT:    s_addc_u32 s1, s7, s8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
-; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
-; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[8:9]
+; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
+; GFX9-NEXT:    v_mul_hi_u32 v5, s0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
+; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    s_movk_i32 s0, 0xfff
+; GFX9-NEXT:    v_mul_lo_u32 v5, s1, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
+; GFX9-NEXT:    s_movk_i32 s6, 0xfff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s0
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s0
-; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s0
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s6
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s6
+; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT:    v_mov_b32_e32 v6, s7
-; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s6, v9
+; GFX9-NEXT:    v_mov_b32_e32 v6, s1
+; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s0, v9
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s0, v9
+; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s6, v9
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_movk_i32 s0, 0xffe
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
@@ -9038,14 +9015,14 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, s8, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s8, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out
@@ -9096,201 +9073,200 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_mov_b32 s17, s16
 ; GFX6-NEXT:    s_addc_u32 s1, s5, s16
-; GFX6-NEXT:    v_mul_lo_u32 v0, s10, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v2
-; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, s10, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
+; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
+; GFX6-NEXT:    v_mul_lo_u32 v5, s10, v0
 ; GFX6-NEXT:    s_xor_b64 s[4:5], s[0:1], s[16:17]
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
-; GFX6-NEXT:    v_mul_lo_u32 v0, v2, v3
-; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v5
-; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v3
-; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v3
-; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
+; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GFX6-NEXT:    s_xor_b64 s[14:15], s[16:17], s[14:15]
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v0, 0
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v1
-; GFX6-NEXT:    v_mul_hi_u32 v4, s10, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
+; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
 ; GFX6-NEXT:    s_mov_b32 s11, 0xf000
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_mul_lo_u32 v4, s10, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v3
-; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v4
-; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v3
-; GFX6-NEXT:    v_mul_hi_u32 v6, v1, v4
-; GFX6-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v6, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v1
-; GFX6-NEXT:    v_mul_hi_u32 v4, s4, v2
-; GFX6-NEXT:    v_mul_hi_u32 v5, s4, v1
-; GFX6-NEXT:    v_mul_hi_u32 v6, s5, v1
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GFX6-NEXT:    v_mul_hi_u32 v4, s4, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s5, v1
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s5, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; GFX6-NEXT:    s_mov_b32 s10, -1
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, s12, v2
-; GFX6-NEXT:    v_mul_hi_u32 v4, s12, v1
-; GFX6-NEXT:    v_mul_lo_u32 v5, s13, v1
-; GFX6-NEXT:    v_mov_b32_e32 v6, s13
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v1
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v3
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
-; GFX6-NEXT:    v_subb_u32_e64 v5, s[0:1], v5, v6, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s12, v4
-; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v6
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v0
+; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v0
+; GFX6-NEXT:    v_mov_b32_e32 v5, s13
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, s12, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v2
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
+; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s12, v3
+; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v6, s[0:1], 2, v1
-; GFX6-NEXT:    v_addc_u32_e64 v7, s[0:1], 0, v2, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v8, s[0:1], 1, v1
-; GFX6-NEXT:    v_addc_u32_e64 v9, s[0:1], 0, v2, s[0:1]
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v5
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v4
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
+; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
+; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
+; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
+; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
 ; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GFX6-NEXT:    s_add_u32 s2, s2, s4
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v9, v7, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v7, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
+; GFX6-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX6-NEXT:    s_mov_b32 s5, s4
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s4
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v9, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v10, s3
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v3
-; GFX6-NEXT:    v_mac_f32_e32 v9, s18, v10
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
-; GFX6-NEXT:    v_rcp_f32_e32 v4, v9
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
-; GFX6-NEXT:    v_mul_f32_e32 v4, s19, v4
-; GFX6-NEXT:    v_mul_f32_e32 v5, s20, v4
-; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX6-NEXT:    v_mac_f32_e32 v4, s21, v5
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v8, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v9, s3
+; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
+; GFX6-NEXT:    v_mac_f32_e32 v8, s18, v9
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
+; GFX6-NEXT:    v_rcp_f32_e32 v3, v8
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
+; GFX6-NEXT:    v_mul_f32_e32 v3, s19, v3
+; GFX6-NEXT:    v_mul_f32_e32 v4, s20, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX6-NEXT:    v_mac_f32_e32 v3, s21, v4
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX6-NEXT:    s_sub_u32 s0, 0, s2
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v4
-; GFX6-NEXT:    v_mul_lo_u32 v6, s0, v5
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v2, s0, v3
+; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
 ; GFX6-NEXT:    s_subb_u32 s1, 0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v7, s1, v4
+; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v3
 ; GFX6-NEXT:    s_ashr_i32 s12, s7, 31
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT:    v_mul_lo_u32 v6, s0, v4
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GFX6-NEXT:    v_mul_lo_u32 v7, v4, v3
-; GFX6-NEXT:    v_mul_hi_u32 v8, v4, v6
-; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v3
-; GFX6-NEXT:    v_mul_hi_u32 v10, v5, v3
-; GFX6-NEXT:    v_mul_lo_u32 v3, v5, v3
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v9, v5, v6
-; GFX6-NEXT:    v_mul_hi_u32 v6, v5, v6
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GFX6-NEXT:    v_mul_lo_u32 v6, v3, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GFX6-NEXT:    v_mul_hi_u32 v8, v3, v2
+; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, v4, v2
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v8, v4, v5
+; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; GFX6-NEXT:    s_mov_b32 s13, s12
-; GFX6-NEXT:    v_xor_b32_e32 v1, s14, v1
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v8, v6, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v10, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v5, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v6, s0, v3
-; GFX6-NEXT:    v_mul_lo_u32 v7, s1, v3
-; GFX6-NEXT:    v_xor_b32_e32 v2, s15, v2
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GFX6-NEXT:    v_mul_lo_u32 v6, s0, v3
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GFX6-NEXT:    v_mul_lo_u32 v9, v3, v5
-; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v6
-; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v5
-; GFX6-NEXT:    v_mul_hi_u32 v8, v4, v6
-; GFX6-NEXT:    v_mul_lo_u32 v6, v4, v6
-; GFX6-NEXT:    v_mul_hi_u32 v7, v4, v5
-; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GFX6-NEXT:    v_addc_u32_e32 v10, vcc, 0, v11, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, v4, v5
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v10, v8, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v7, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v0, s14, v0
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v2
+; GFX6-NEXT:    v_xor_b32_e32 v1, s15, v1
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GFX6-NEXT:    v_mul_lo_u32 v8, v2, v4
+; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
+; GFX6-NEXT:    v_mul_hi_u32 v10, v2, v4
+; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GFX6-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GFX6-NEXT:    v_mul_hi_u32 v6, v3, v4
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX6-NEXT:    s_add_u32 s0, s6, s12
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    s_addc_u32 s1, s7, s12
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
-; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v4
-; GFX6-NEXT:    v_mul_hi_u32 v6, s6, v3
-; GFX6-NEXT:    v_mul_hi_u32 v8, s6, v4
-; GFX6-NEXT:    v_mul_hi_u32 v9, s7, v4
-; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v4
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v3
-; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX6-NEXT:    v_mov_b32_e32 v7, s15
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v3, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v9, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v0, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v4
-; GFX6-NEXT:    v_mul_hi_u32 v6, s2, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v1
-; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v7, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s3, v3
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v3
+; GFX6-NEXT:    v_mul_hi_u32 v5, s6, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v3
+; GFX6-NEXT:    v_mul_hi_u32 v8, s7, v3
+; GFX6-NEXT:    v_mul_lo_u32 v3, s7, v3
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v7, s7, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v6, s15
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v3
+; GFX6-NEXT:    v_mul_hi_u32 v5, s2, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
+; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, s3, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s7, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v7, s3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v3
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s7, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s6, v5
 ; GFX6-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
 ; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s2, v5
@@ -9301,30 +9277,30 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v3
-; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v4, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v3
-; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v4, s[0:1]
+; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v2
+; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
+; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v2
+; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v8, s7
-; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v8, v2, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
+; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v5, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    s_xor_b64 s[0:1], s[12:13], s[4:5]
-; GFX6-NEXT:    v_xor_b32_e32 v3, s0, v3
-; GFX6-NEXT:    v_xor_b32_e32 v4, s1, v2
-; GFX6-NEXT:    v_mov_b32_e32 v5, s1
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v3
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v4, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_xor_b32_e32 v3, s1, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, s1
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v2
+; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
@@ -9358,75 +9334,75 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v2
-; GFX9-NEXT:    v_add_u32_e32 v0, v3, v0
-; GFX9-NEXT:    v_add_u32_e32 v5, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v6, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v5
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v6, s2, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
 ; GFX9-NEXT:    s_add_u32 s2, s4, s14
-; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_mul_lo_u32 v7, v2, v3
-; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v6
-; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v3
-; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v6
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v5
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
+; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    s_addc_u32 s3, s5, s14
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[2:3], s[14:15]
-; GFX9-NEXT:    v_mul_lo_u32 v3, s4, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s4, v2
-; GFX9-NEXT:    v_mul_hi_u32 v5, s4, v1
-; GFX9-NEXT:    v_mul_hi_u32 v6, s5, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GFX9-NEXT:    v_mul_hi_u32 v4, s4, v1
+; GFX9-NEXT:    v_mul_hi_u32 v5, s5, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, s5, v2
-; GFX9-NEXT:    v_mul_hi_u32 v2, s5, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, s5, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_xor_b64 s[12:13], s[14:15], s[12:13]
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v2, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v2, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v0, v1
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s9, v1
-; GFX9-NEXT:    v_mov_b32_e32 v6, s9
+; GFX9-NEXT:    s_xor_b64 s[12:13], s[14:15], s[12:13]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v1
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
@@ -9495,7 +9471,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_mov_b32 s9, s8
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v0, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v10, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
@@ -9518,7 +9494,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v4, v5
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v7, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v0, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
@@ -9538,7 +9514,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s13, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v0, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v9, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v4
@@ -9607,8 +9583,8 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffed2705
-; GFX6-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
@@ -9626,42 +9602,41 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v7, v1, v4
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[8:9]
 ; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v5, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s4
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s4
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
-; GFX6-NEXT:    v_mul_lo_u32 v7, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v6, v1, v3
+; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v6, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -9669,7 +9644,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v6, s3, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
@@ -9679,7 +9654,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fb
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v6, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
@@ -9722,7 +9697,6 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    s_mov_b32 s2, 0xffed2705
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
@@ -9737,16 +9711,16 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
@@ -9759,18 +9733,18 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    s_add_u32 s0, s6, s2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
@@ -9781,7 +9755,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v1
-; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
+; GFX9-NEXT:    v_mul_hi_u32 v5, s1, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
@@ -9790,41 +9764,42 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fb
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s3
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s3, v0
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s3, v2
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s3, v2
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s0, 0x12d8fa
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i64 %x, 1235195
   store i64 %r, i64 addrspace(1)* %out
@@ -9934,48 +9909,47 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    s_xor_b64 s[12:13], s[2:3], s[10:11]
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
-; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
+; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GFX6-NEXT:    v_mul_lo_u32 v7, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v6, v1, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v6, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v0
-; GFX6-NEXT:    v_mul_hi_u32 v5, s12, v1
-; GFX6-NEXT:    v_mul_hi_u32 v6, s13, v1
+; GFX6-NEXT:    v_mul_hi_u32 v4, s12, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s13, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s13, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s13, v0
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s13, v0
 ; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v6, v4, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s8, v1
@@ -10025,7 +9999,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
@@ -10050,89 +10023,90 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_add_u32 s0, s6, s10
 ; GFX9-NEXT:    s_mov_b32 s11, s10
-; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v6, s3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
-; GFX9-NEXT:    s_addc_u32 s1, s7, s10
-; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
-; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v3
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v5
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
-; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v7, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v6, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v0
+; GFX9-NEXT:    s_addc_u32 s1, s7, s10
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v3
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v6
-; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v3
-; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v6
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v5
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
+; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v0
-; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
+; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
+; GFX9-NEXT:    v_mul_hi_u32 v5, s7, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v4, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v2, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v0
+; GFX9-NEXT:    v_mul_hi_u32 v2, s8, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s9, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v3, v1
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
-; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s9
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT:    v_sub_u32_e32 v2, s7, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
-; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc
+; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
 ; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s8, v0
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v5
-; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
+; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v6
-; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v5
+; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s8, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
@@ -10142,15 +10116,15 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v4, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s10, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s10, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s10, v0
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = srem i64 %x, %shl.y
@@ -10173,24 +10147,23 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT:    s_movk_i32 s8, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s9, s5, 31
-; GFX6-NEXT:    s_lshr_b32 s9, s9, 20
-; GFX6-NEXT:    s_add_u32 s9, s4, s9
-; GFX6-NEXT:    s_addc_u32 s10, s5, 0
-; GFX6-NEXT:    s_and_b32 s9, s9, s8
-; GFX6-NEXT:    s_sub_u32 s4, s4, s9
-; GFX6-NEXT:    s_subb_u32 s5, s5, s10
-; GFX6-NEXT:    s_ashr_i32 s9, s7, 31
-; GFX6-NEXT:    s_lshr_b32 s9, s9, 20
-; GFX6-NEXT:    s_add_u32 s9, s6, s9
-; GFX6-NEXT:    s_addc_u32 s10, s7, 0
-; GFX6-NEXT:    s_and_b32 s8, s9, s8
+; GFX6-NEXT:    s_ashr_i32 s8, s5, 31
+; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
+; GFX6-NEXT:    s_add_u32 s8, s4, s8
+; GFX6-NEXT:    s_addc_u32 s9, s5, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 0xfffff000
+; GFX6-NEXT:    s_sub_u32 s4, s4, s8
+; GFX6-NEXT:    s_subb_u32 s5, s5, s9
+; GFX6-NEXT:    s_ashr_i32 s8, s7, 31
+; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
+; GFX6-NEXT:    s_add_u32 s8, s6, s8
+; GFX6-NEXT:    s_addc_u32 s9, s7, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 0xfffff000
 ; GFX6-NEXT:    s_sub_u32 s6, s6, s8
-; GFX6-NEXT:    s_subb_u32 s7, s7, s10
+; GFX6-NEXT:    s_subb_u32 s7, s7, s9
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
@@ -10202,26 +10175,25 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_movk_i32 s0, 0xf000
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s1, s5, 31
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
-; GFX9-NEXT:    s_add_u32 s1, s4, s1
-; GFX9-NEXT:    s_addc_u32 s8, s5, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, s0
-; GFX9-NEXT:    s_sub_u32 s1, s4, s1
-; GFX9-NEXT:    s_subb_u32 s4, s5, s8
-; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
-; GFX9-NEXT:    s_lshr_b32 s5, s5, 20
-; GFX9-NEXT:    s_add_u32 s5, s6, s5
-; GFX9-NEXT:    s_addc_u32 s8, s7, 0
-; GFX9-NEXT:    s_and_b32 s0, s5, s0
-; GFX9-NEXT:    s_sub_u32 s0, s6, s0
-; GFX9-NEXT:    s_subb_u32 s5, s7, s8
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, 0
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; GFX9-NEXT:    s_sub_u32 s0, s4, s0
+; GFX9-NEXT:    s_subb_u32 s1, s5, s1
+; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX9-NEXT:    s_add_u32 s4, s6, s4
+; GFX9-NEXT:    s_addc_u32 s5, s7, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
+; GFX9-NEXT:    s_sub_u32 s4, s6, s4
+; GFX9-NEXT:    s_subb_u32 s5, s7, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
@@ -10274,202 +10246,201 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_mov_b32 s13, s12
 ; GFX6-NEXT:    s_addc_u32 s1, s5, s12
-; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v2
-; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v0
 ; GFX6-NEXT:    s_xor_b64 s[4:5], s[0:1], s[12:13]
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
-; GFX6-NEXT:    v_mul_lo_u32 v0, v2, v3
-; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v5
-; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v3
-; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v3
-; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
+; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GFX6-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s10, -1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v0, 0
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, s3, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v3
-; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v4
-; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v3
-; GFX6-NEXT:    v_mul_hi_u32 v6, v1, v4
-; GFX6-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v6, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v1
-; GFX6-NEXT:    v_mul_hi_u32 v4, s4, v2
-; GFX6-NEXT:    v_mul_hi_u32 v5, s4, v1
-; GFX6-NEXT:    v_mul_hi_u32 v6, s5, v1
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GFX6-NEXT:    v_mul_hi_u32 v4, s4, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s5, v1
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s5, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s16, v2
-; GFX6-NEXT:    v_mul_hi_u32 v3, s16, v1
-; GFX6-NEXT:    v_mul_lo_u32 v4, s17, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s16, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v4, s17
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
-; GFX6-NEXT:    v_subb_u32_e64 v3, s[0:1], v3, v4, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s16, v1
-; GFX6-NEXT:    v_subbrev_u32_e64 v6, s[2:3], 0, v3, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v6
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v1, s16, v1
+; GFX6-NEXT:    v_mul_hi_u32 v2, s16, v0
+; GFX6-NEXT:    v_mul_lo_u32 v3, s17, v0
+; GFX6-NEXT:    v_mul_lo_u32 v0, s16, v0
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v1
+; GFX6-NEXT:    v_mov_b32_e32 v3, s17
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
+; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s16, v0
+; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v5
+; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v4
+; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v5
-; GFX6-NEXT:    v_subb_u32_e64 v3, s[0:1], v3, v4, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v6
-; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s16, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
-; GFX6-NEXT:    v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v5
+; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v4
+; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
+; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GFX6-NEXT:    s_ashr_i32 s2, s15, 31
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GFX6-NEXT:    s_add_u32 s4, s14, s2
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v6, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GFX6-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX6-NEXT:    s_mov_b32 s3, s2
 ; GFX6-NEXT:    s_addc_u32 s5, s15, s2
 ; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[2:3]
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v8, s5
-; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s17, v2
-; GFX6-NEXT:    v_mac_f32_e32 v7, s18, v8
-; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s16, v1
-; GFX6-NEXT:    v_rcp_f32_e32 v7, v7
-; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v5, v4, s[0:1]
-; GFX6-NEXT:    v_mul_f32_e32 v4, s19, v7
-; GFX6-NEXT:    v_mul_f32_e32 v5, s20, v4
-; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX6-NEXT:    v_mac_f32_e32 v4, s21, v5
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s5
+; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
+; GFX6-NEXT:    v_mac_f32_e32 v6, s18, v7
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
+; GFX6-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
+; GFX6-NEXT:    v_mul_f32_e32 v3, s19, v6
+; GFX6-NEXT:    v_mul_f32_e32 v4, s20, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX6-NEXT:    v_mac_f32_e32 v3, s21, v4
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX6-NEXT:    s_sub_u32 s0, 0, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v4
-; GFX6-NEXT:    v_mul_lo_u32 v6, s0, v5
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v2, s0, v3
+; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
 ; GFX6-NEXT:    s_subb_u32 s1, 0, s5
-; GFX6-NEXT:    v_mul_lo_u32 v7, s1, v4
+; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v3
 ; GFX6-NEXT:    s_ashr_i32 s14, s7, 31
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT:    v_mul_lo_u32 v6, s0, v4
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GFX6-NEXT:    v_mul_lo_u32 v7, v4, v3
-; GFX6-NEXT:    v_mul_hi_u32 v8, v4, v6
-; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v3
-; GFX6-NEXT:    v_mul_hi_u32 v10, v5, v3
-; GFX6-NEXT:    v_mul_lo_u32 v3, v5, v3
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v9, v5, v6
-; GFX6-NEXT:    v_mul_hi_u32 v6, v5, v6
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GFX6-NEXT:    v_mul_lo_u32 v6, v3, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GFX6-NEXT:    v_mul_hi_u32 v8, v3, v2
+; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, v4, v2
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v8, v4, v5
+; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; GFX6-NEXT:    s_mov_b32 s15, s14
+; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s12, v1
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v8, v6, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v10, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v5, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v6, s0, v3
-; GFX6-NEXT:    v_mul_lo_u32 v7, s1, v3
-; GFX6-NEXT:    v_xor_b32_e32 v2, s12, v2
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GFX6-NEXT:    v_mul_lo_u32 v6, s0, v3
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GFX6-NEXT:    v_mul_lo_u32 v9, v3, v5
-; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v6
-; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v5
-; GFX6-NEXT:    v_mul_hi_u32 v8, v4, v6
-; GFX6-NEXT:    v_mul_lo_u32 v6, v4, v6
-; GFX6-NEXT:    v_mul_hi_u32 v7, v4, v5
-; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GFX6-NEXT:    v_addc_u32_e32 v10, vcc, 0, v11, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, v4, v5
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v10, v8, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v7, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GFX6-NEXT:    v_mul_lo_u32 v8, v2, v4
+; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
+; GFX6-NEXT:    v_mul_hi_u32 v10, v2, v4
+; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GFX6-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GFX6-NEXT:    v_mul_hi_u32 v6, v3, v4
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX6-NEXT:    s_add_u32 s0, s6, s14
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    s_addc_u32 s1, s7, s14
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[14:15]
-; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v4
-; GFX6-NEXT:    v_mul_hi_u32 v6, s6, v3
-; GFX6-NEXT:    v_mul_hi_u32 v8, s6, v4
-; GFX6-NEXT:    v_mul_hi_u32 v9, s7, v4
-; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v4
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v3
-; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX6-NEXT:    v_mov_b32_e32 v7, s12
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v3, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v9, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v0, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
-; GFX6-NEXT:    v_mul_hi_u32 v5, s4, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v1
-; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v7, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s5, v3
-; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v3
+; GFX6-NEXT:    v_mul_hi_u32 v5, s6, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v3
+; GFX6-NEXT:    v_mul_hi_u32 v8, s7, v3
+; GFX6-NEXT:    v_mul_lo_u32 v3, s7, v3
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s7, v2
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v7, s7, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v6, s12
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v3
+; GFX6-NEXT:    v_mul_hi_u32 v4, s4, v2
+; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
+; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v2
+; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s7, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s5
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s4, v3
+; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s4, v2
 ; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s5, v7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
@@ -10483,22 +10454,22 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v7, s7
-; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v7, v2, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
+; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v2
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v2, s14, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v3, s14, v3
-; GFX6-NEXT:    v_xor_b32_e32 v4, s14, v2
-; GFX6-NEXT:    v_mov_b32_e32 v5, s14
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v3
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v4, v5, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v4, s14
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v2
+; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
@@ -10532,74 +10503,74 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v2
-; GFX9-NEXT:    v_add_u32_e32 v0, v3, v0
-; GFX9-NEXT:    v_add_u32_e32 v5, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v6, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v5
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v6, s2, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
 ; GFX9-NEXT:    s_add_u32 s2, s4, s8
-; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_mul_lo_u32 v7, v2, v3
-; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v6
-; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v3
-; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v6
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v5
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
+; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    s_addc_u32 s3, s5, s8
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    s_xor_b64 s[14:15], s[2:3], s[8:9]
-; GFX9-NEXT:    v_mul_lo_u32 v3, s14, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v2
-; GFX9-NEXT:    v_mul_hi_u32 v5, s14, v1
-; GFX9-NEXT:    v_mul_hi_u32 v6, s15, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v0
+; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v1
+; GFX9-NEXT:    v_mul_hi_u32 v5, s15, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s15, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, s15, v2
-; GFX9-NEXT:    v_mul_hi_u32 v2, s15, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, s15, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s15, v0
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v2, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v2, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v0, v1
+; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s12, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s13, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s12, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s15, v2
@@ -10667,7 +10638,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_mov_b32 s13, s12
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v0, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v10, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
@@ -10690,7 +10661,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v4, v5
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v7, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v0, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
@@ -10710,7 +10681,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s8, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v0, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v9, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index 8cad7d033ac8e..e82f2c6ea9171 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -63,9 +63,8 @@ define amdgpu_kernel void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
 
 ; Second use is a VGPR use of the constant.
 ; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_0:
-; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687
-; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]]
-; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
+; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x12d687
+; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x12d687
 ; SI: buffer_store_dword [[VK]]
 define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %and = and i32 %a, 1234567
@@ -79,10 +78,9 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out
 
 ; Second use is another SGPR use of the constant.
 ; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_1:
-; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687
-; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]]
+; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x12d687
 ; SI: s_add_i32
-; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]]
+; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, 0x12d687
 ; SI: v_mov_b32_e32 [[VADD:v[0-9]+]], [[ADD]]
 ; SI: buffer_store_dword [[VADD]]
 define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index c855060e12e2c..e478e2c0b62b7 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -3067,7 +3067,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
 ; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v2, v1
+; GFX8-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX8-NEXT:    s_not_b64 exec, exec
 ; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3118,7 +3118,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
 ; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX9-NEXT:    s_not_b64 exec, exec
 ; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3160,41 +3160,39 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX1064-LABEL: max_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1064-NEXT:    s_not_b64 exec, exec
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
-; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
-; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v2, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s4
-; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v2, 15
-; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
+; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_readlane_b32 s5, v2, 31
-; GFX1064-NEXT:    v_writelane_b32 v1, s4, 16
+; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
+; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
 ; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_readlane_b32 s7, v2, 63
-; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
-; GFX1064-NEXT:    v_writelane_b32 v1, s5, 32
+; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
+; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
+; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
 ; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
+; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
 ; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
@@ -3214,7 +3212,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3223,31 +3221,29 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX1032-LABEL: max_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
-; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
+; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_readlane_b32 s3, v2, 15
-; GFX1032-NEXT:    v_readlane_b32 s4, v2, 31
-; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
+; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
+; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_writelane_b32 v1, s3, 16
+; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
@@ -3266,7 +3262,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3484,7 +3480,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
 ; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v2, v1
+; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
 ; GFX8-NEXT:    s_not_b64 exec, exec
 ; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3535,7 +3531,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
 ; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_bfrev_b32_e32 v2, -2
 ; GFX9-NEXT:    s_not_b64 exec, exec
 ; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3577,41 +3573,39 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX1064-LABEL: min_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
 ; GFX1064-NEXT:    s_not_b64 exec, exec
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
-; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
-; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v2, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s4
-; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v2, 15
-; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-NEXT:    v_bfrev_b32_e32 v3, -2
+; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
+; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_readlane_b32 s5, v2, 31
-; GFX1064-NEXT:    v_writelane_b32 v1, s4, 16
+; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
+; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
 ; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_readlane_b32 s7, v2, 63
-; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
-; GFX1064-NEXT:    v_writelane_b32 v1, s5, 32
+; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
+; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
+; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
 ; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
+; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
 ; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
@@ -3631,7 +3625,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3640,31 +3634,29 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX1032-LABEL: min_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
 ; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
-; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
+; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_readlane_b32 s3, v2, 15
-; GFX1032-NEXT:    v_readlane_b32 s4, v2, 31
-; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032-NEXT:    v_bfrev_b32_e32 v3, -2
+; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
+; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
+; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_writelane_b32 v1, s3, 16
+; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
@@ -3683,7 +3675,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index 907ba8dd3086d..9077a28571375 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -27,7 +27,6 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v10, vcc
 ; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
@@ -42,11 +41,11 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v13, v3
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0
-; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v15, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v14, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v14, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v6, v2
@@ -57,13 +56,13 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_add3_u32 v5, v3, v4, v5
 ; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0
-; GFX9-NEXT:    v_mul_hi_u32 v15, v13, v2
+; GFX9-NEXT:    v_mul_hi_u32 v14, v13, v2
 ; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v15, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v14, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
@@ -81,7 +80,7 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v3, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v14, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v10, v2
@@ -175,7 +174,6 @@ define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
@@ -190,32 +188,32 @@ define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v5
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
-; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v14, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v9, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, v8, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v14
+; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v13
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v12
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
 ; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
-; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v12, v4
-; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v11, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v13, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v14, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v4
@@ -224,7 +222,7 @@ define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v4, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v6
@@ -318,7 +316,6 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v9, vcc
 ; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
@@ -333,11 +330,11 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v3
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0
-; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v11, v5, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v12, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v14, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v13, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v13, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v6, v2
@@ -348,13 +345,13 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_add3_u32 v5, v3, v4, v5
 ; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v11, v5, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v12, v5, 0
-; GFX9-NEXT:    v_mul_hi_u32 v14, v12, v2
+; GFX9-NEXT:    v_mul_hi_u32 v13, v12, v2
 ; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v2, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v13, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v12, v2
@@ -372,7 +369,7 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v13, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v9, v0
@@ -462,7 +459,6 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
@@ -477,32 +473,32 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v5
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
-; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v14, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v9, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, v8, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v14
+; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v13
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v12
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
 ; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
-; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v12, v4
-; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v11, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v13, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v14, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v4
@@ -511,7 +507,7 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v3, v4
@@ -728,7 +724,6 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v10, vcc
 ; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
@@ -743,11 +738,11 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v13, v3
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0
-; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v15, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v14, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v14, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v6, v2
@@ -758,13 +753,13 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_add3_u32 v5, v3, v4, v5
 ; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0
-; GFX9-NEXT:    v_mul_hi_u32 v15, v13, v2
+; GFX9-NEXT:    v_mul_hi_u32 v14, v13, v2
 ; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v15, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v14, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
@@ -782,7 +777,7 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v14, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v2
@@ -896,7 +891,6 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
@@ -911,32 +905,32 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v5
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
-; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v14, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v9, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, v8, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v14
+; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v13
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v12
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
 ; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
-; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v12, v4
-; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v11, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v13, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v14, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v4
@@ -945,7 +939,7 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v4, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v6

diff  --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
index 65ffba83df95b..c9d23d76fdfd0 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
@@ -5,7 +5,7 @@
 ; CHECK-LABEL: _Z11test_kernelPii:
 ; CHECK: s_mul_i32
 ; CHECK: s_sub_i32
-; CHECK: s_and_b32 [[S1:s[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK: s_and_b32 [[S1:s[0-9]+]], {{s[0-9]+}}, 0xffff
 ; CHECK: s_add_i32 [[S2:s[0-9]+]], {{s[0-9]+}}, [[S1]]
 ; CHECK: s_or_b32 {{s[0-9]+}}, [[S2]], 0xc0
 

diff  --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
index 22ac0fa31743e..cf55eb8d2f12a 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
@@ -226,7 +226,6 @@ define amdgpu_vs i32 @load_i32_hifffffff0(i32 addrspace(6)* inreg %p) #4 {
 
 ; GCN-LABEL: {{^}}load_sampler
 ; GCN: v_readfirstlane_b32
-; GCN-NEXT: v_readfirstlane_b32
 ; SI: s_nop
 ; GCN: s_load_dwordx8
 ; GCN-NEXT: s_load_dwordx4
@@ -260,7 +259,6 @@ main_body:
 
 ; GCN-LABEL: {{^}}load_sampler_nouniform
 ; GCN: v_readfirstlane_b32
-; GCN-NEXT: v_readfirstlane_b32
 ; SI: s_nop
 ; GCN: s_load_dwordx8
 ; GCN-NEXT: s_load_dwordx4

diff  --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index f757ad818ac0d..05e3acd2591d6 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -1500,15 +1500,13 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX10-GISEL-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-GISEL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, s2, vcc_lo
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %val = load i16, i16 addrspace(1)* %valptr
@@ -1604,19 +1602,18 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out,
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX10-GISEL-NEXT:    s_movk_i32 s2, 0x7f
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 25, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 3000858c7786c..8603cd75e3562 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -1492,14 +1492,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX10-GISEL-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-GISEL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, s2, vcc_lo
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %val = load i16, i16 addrspace(1)* %valptr
@@ -1595,18 +1593,17 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out,
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX10-GISEL-NEXT:    s_movk_i32 s2, 0x7f
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x80, v0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 8de42ad09100c..05b13a985935b 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1205,7 +1205,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v5, 9
-; VI-NEXT:    s_movk_i32 s8, 0x900
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -1232,8 +1231,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v6
 ; VI-NEXT:    v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_mov_b32_e32 v2, s8
-; VI-NEXT:    v_add_u16_e32 v0, s8, v0
+; VI-NEXT:    v_mov_b32_e32 v2, 0x900
+; VI-NEXT:    v_add_u16_e32 v0, 0x900, v0
 ; VI-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -1250,7 +1249,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; GFX10-NEXT:    s_movk_i32 s0, 0x900
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -1261,8 +1259,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
-; GFX10-NEXT:    v_add_nc_u16 v1, v1, s0
-; GFX10-NEXT:    v_add_nc_u16 v5, v2, s0
+; GFX10-NEXT:    v_add_nc_u16 v1, v1, 0x900
+; GFX10-NEXT:    v_add_nc_u16 v5, v2, 0x900
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index c686598225a25..c3262b4671ed2 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -71,21 +71,20 @@ define <4 x i16> @extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspac
 ; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
 ; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
 ; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT:    s_mov_b32 s4, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v3, 0x8000
-; SI-NEXT:    v_mov_b32_e32 v4, 0xffff0000
-; SI-NEXT:    v_bfrev_b32_e32 v5, 1
-; SI-NEXT:    v_mov_b32_e32 v6, 0xffff8000
-; SI-NEXT:    v_mov_b32_e32 v7, s4
+; SI-NEXT:    v_mov_b32_e32 v3, 0xffff
+; SI-NEXT:    v_mov_b32_e32 v4, 0x8000
+; SI-NEXT:    v_mov_b32_e32 v5, 0xffff0000
+; SI-NEXT:    v_bfrev_b32_e32 v6, 1
+; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v7, vcc
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; SI-NEXT:    v_and_b32_e32 v2, s4, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index d9ae5421516ed..38243711c0e7a 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -42,9 +42,8 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half
 ; CI: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2
 ; GFX89: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x8
 
-; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
-; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]]
-; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]]
+; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], 0x7fff7fff
+; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], 0x7fff7fff
 ; GCN: {{flat|global}}_store_dwordx2
 define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
   %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)

diff  --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
index 809e7de631147..1629d55dcda96 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
@@ -30,8 +30,8 @@ define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double %in) {
 }
 
 ; FUNC-LABEL: {{^}}fabs_v2f64:
-; SI: s_and_b32
-; SI: s_and_b32
+; SI: s_bitset0_b32
+; SI: s_bitset0_b32
 ; SI: s_endpgm
 define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
   %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
@@ -40,10 +40,10 @@ define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
 }
 
 ; FUNC-LABEL: {{^}}fabs_v4f64:
-; SI: s_and_b32
-; SI: s_and_b32
-; SI: s_and_b32
-; SI: s_and_b32
+; SI: s_bitset0_b32
+; SI: s_bitset0_b32
+; SI: s_bitset0_b32
+; SI: s_bitset0_b32
 ; SI: s_endpgm
 define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
   %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)

diff  --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index b6ad0c49dc26a..d98a77a52903c 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -45,8 +45,8 @@ define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) {
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
-; GCN: s_and_b32
-; GCN: s_and_b32
+; GCN: s_bitset0_b32
+; GCN: s_bitset0_b32
 define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   store <2 x float> %fabs, <2 x float> addrspace(1)* %out
@@ -59,10 +59,10 @@ define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
-; GCN: s_and_b32
-; GCN: s_and_b32
-; GCN: s_and_b32
-; GCN: s_and_b32
+; GCN: s_bitset0_b32
+; GCN: s_bitset0_b32
+; GCN: s_bitset0_b32
+; GCN: s_bitset0_b32
 define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
   store <4 x float> %fabs, <4 x float> addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/fexp.ll b/llvm/test/CodeGen/AMDGPU/fexp.ll
index 7eaac9421f566..50e9533890b77 100644
--- a/llvm/test/CodeGen/AMDGPU/fexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fexp.ll
@@ -107,10 +107,9 @@ define <2 x half> @v_exp_v2f16(<2 x half> %arg0) {
 ; VI-LABEL: v_exp_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    s_movk_i32 s4, 0x3dc5
-; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    v_mov_b32_e32 v1, 0x3dc5
 ; VI-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_mul_f16_e32 v0, s4, v0
+; VI-NEXT:    v_mul_f16_e32 v0, 0x3dc5, v0
 ; VI-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_exp_f16_e32 v0, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -161,7 +160,7 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_movk_i32 s4, 0x3dc5
-; VI-NEXT:    v_mov_b32_e32 v3, s4
+; VI-NEXT:    v_mov_b32_e32 v3, 0x3dc5
 ; VI-NEXT:    v_mul_f16_e32 v2, s4, v1
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_mul_f16_e32 v4, s4, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index cba84f5dfe4a6..81ff336ef3210 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -532,12 +532,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GFX10-NEXT:    scratch_store_dword v2, v3, off
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
+; GFX10-NEXT:    scratch_store_dword v1, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -585,12 +584,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 4, v0
+; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
+; GFX10-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
@@ -639,12 +637,11 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s32
-; GFX10-NEXT:    v_and_b32_e32 v2, 15, v0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX10-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -681,12 +678,11 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX10-PAL:       ; %bb.0: ; %bb
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s32
-; GFX10-PAL-NEXT:    v_and_b32_e32 v2, 15, v0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
-; GFX10-PAL-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
@@ -1387,14 +1383,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0x104
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    scratch_store_dword v2, v3, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX10-NEXT:    scratch_store_dword v1, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -1447,15 +1442,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
 ; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GFX1010-PAL-NEXT:    scratch_load_dword v1, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT:    scratch_store_dword v2, v3, off
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
+; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
@@ -1472,14 +1466,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
 ; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, 15
-; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GFX1030-PAL-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
+; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    scratch_store_dword v2, v3, off
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
+; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
@@ -1524,15 +1517,15 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
-; GFX10-NEXT:    v_and_b32_e32 v2, 15, v0
-; GFX10-NEXT:    v_mov_b32_e32 v1, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX10-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
-; GFX10-NEXT:    scratch_load_dword v2, off, s32 glc dlc
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
+; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -1574,15 +1567,15 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10-PAL:       ; %bb.0: ; %bb
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
+; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
 ; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
-; GFX10-PAL-NEXT:    v_and_b32_e32 v2, 15, v0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, vcc_lo
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
-; GFX10-PAL-NEXT:    scratch_load_dword v2, off, s32 glc dlc
+; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
@@ -2277,14 +2270,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4004
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    scratch_store_dword v2, v3, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
+; GFX10-NEXT:    scratch_store_dword v1, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -2338,15 +2330,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
 ; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GFX1010-PAL-NEXT:    scratch_load_dword v1, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT:    scratch_store_dword v2, v3, off
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
+; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
+; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
@@ -2363,14 +2354,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
 ; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, 15
-; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GFX1030-PAL-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
+; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    scratch_store_dword v2, v3, off
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
+; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
+; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
@@ -2415,15 +2405,15 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
-; GFX10-NEXT:    v_and_b32_e32 v2, 15, v0
-; GFX10-NEXT:    v_mov_b32_e32 v1, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX10-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
-; GFX10-NEXT:    scratch_load_dword v2, off, s32 offset:4 glc dlc
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
+; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -2467,15 +2457,15 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-PAL:       ; %bb.0: ; %bb
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
+; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
 ; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
-; GFX10-PAL-NEXT:    v_and_b32_e32 v2, 15, v0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, vcc_lo
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
-; GFX10-PAL-NEXT:    scratch_load_dword v2, off, s32 offset:4 glc dlc
+; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 8b88167d0ae1b..c4d72c832c8e9 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -970,7 +970,7 @@ define amdgpu_kernel void @two_non_inline_constant(float addrspace(1)* %out, flo
 
 ; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants.
 ; GCN-LABEL: {{^}}one_non_inline_constant:
-; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x41800000
+; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41800000
 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5,
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 1.0, [[K1]]
 define amdgpu_kernel void @one_non_inline_constant(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
@@ -990,9 +990,8 @@ define amdgpu_kernel void @one_non_inline_constant(float addrspace(1)* %out, flo
 }
 
 ; GCN-LABEL: {{^}}two_non_inline_constant_multi_use:
-; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x41800000
 ; GCN-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x41000000
-; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], [[K1]]
+; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x41800000
 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5,
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], [[K0]], [[VK1]]
 define amdgpu_kernel void @two_non_inline_constant_multi_use(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {

diff  --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 76099a7d2c4d4..1cbb2c3ce1b99 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -97,9 +97,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(<2 x half> addrspace(1)* %ou
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_v4f16:
-; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
-; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
-; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
+; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
+; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
 ; GCN: {{flat|global}}_store_dwordx2
 define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
   %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)

diff  --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index 6c6fcfa23ccbd..14525dba1b1c3 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -69,10 +69,9 @@ define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, [8 x i32], d
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_v2f64:
-; GCN: s_brev_b32 [[IMMREG:s[0-9]+]], 1{{$}}
 ; GCN-NOT: 0x80000000
-; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
+; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
   %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
   %fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
@@ -81,12 +80,11 @@ define amdgpu_kernel void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_v4f64:
-; GCN: s_brev_b32 [[IMMREG:s[0-9]+]], 1{{$}}
 ; GCN-NOT: 0x80000000
-; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
+; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
+; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
+; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
   %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
   %fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs

diff  --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index ef6201bac47e7..1d7b0450bd99a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -84,9 +84,8 @@ define amdgpu_kernel void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrs
 ; R600: -PV
 
 ; FIXME: In this case two uses of the constant should be folded
-; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}}
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
@@ -95,11 +94,10 @@ define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x
 }
 
 ; FUNC-LABEL: {{^}}fneg_fabs_v4f32:
-; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}}
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
   %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs

diff  --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index 9a8a8c08db888..291ca5f8fe57c 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -18,9 +18,8 @@ define amdgpu_kernel void @s_fneg_f32(float addrspace(1)* %out, float %in) {
 ; R600: -PV
 ; R600: -PV
 
-; GCN: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1
-; GCN: s_xor_b32
-; GCN: s_xor_b32
+; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
+; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
 define amdgpu_kernel void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
   %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
   store <2 x float> %fneg, <2 x float> addrspace(1)* %out
@@ -33,10 +32,10 @@ define amdgpu_kernel void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out
 ; R600: -PV
 ; R600: -PV
 
-; GCN: s_xor_b32
-; GCN: s_xor_b32
-; GCN: s_xor_b32
-; GCN: s_xor_b32
+; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
+; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
+; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
+; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
 define amdgpu_kernel void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
   %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
   store <4 x float> %fneg, <4 x float> addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir b/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir
index 079147083863b..4e63330211a7f 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir
@@ -36,9 +36,10 @@ body:             |
     ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345
     ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; GCN: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[DEF]], 0, implicit $exec
-    ; GCN: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[DEF1]], 0, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_1]], implicit [[V_ADD_CO_U32_e64_2]]
+    ; GCN: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec
+    ; GCN: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY killed $vcc
+    ; GCN: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF1]], implicit-def $vcc, implicit $exec
+    ; GCN: S_ENDPGM 0, implicit [[COPY]], implicit [[V_ADD_CO_U32_e32_1]]
     %0:sreg_32_xm0 = S_MOV_B32 12345
     %1:vgpr_32 = IMPLICIT_DEF
     %2:vgpr_32 = IMPLICIT_DEF

diff  --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 239750ed8c5b2..7b17b54becf38 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -1344,16 +1344,14 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
 ; SI-NEXT:    v_div_scale_f32 v4, vcc, v0, v2, v0
 ; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, v0
 ; SI-NEXT:    v_rcp_f32_e32 v6, v5
-; SI-NEXT:    s_mov_b32 s6, 3
-; SI-NEXT:    s_mov_b32 s7, 0
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
 ; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
 ; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
 ; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
 ; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
 ; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
 ; SI-NEXT:    v_div_fixup_f32 v4, v4, v2, v0
 ; SI-NEXT:    v_trunc_f32_e32 v4, v4
@@ -1364,14 +1362,14 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
 ; SI-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
 ; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v1
 ; SI-NEXT:    v_rcp_f32_e32 v5, v4
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
 ; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
 ; SI-NEXT:    v_mul_f32_e32 v6, v2, v5
 ; SI-NEXT:    v_fma_f32 v7, -v4, v6, v2
 ; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
 ; SI-NEXT:    v_fma_f32 v2, -v4, v6, v2
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v2, v2, v5, v6
 ; SI-NEXT:    v_div_fixup_f32 v2, v2, v3, v1
 ; SI-NEXT:    v_trunc_f32_e32 v2, v2
@@ -1398,8 +1396,6 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
 ; CI-NEXT:    s_mov_b32 s11, s3
 ; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; CI-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:16
-; CI-NEXT:    s_mov_b32 s6, 3
-; CI-NEXT:    s_mov_b32 s7, 0
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -1411,14 +1407,14 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
 ; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, v0
 ; CI-NEXT:    v_div_scale_f32 v4, vcc, v0, v2, v0
 ; CI-NEXT:    v_rcp_f32_e32 v6, v5
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; CI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
 ; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
 ; CI-NEXT:    v_mul_f32_e32 v7, v4, v6
 ; CI-NEXT:    v_fma_f32 v8, -v5, v7, v4
 ; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
 ; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
 ; CI-NEXT:    v_div_fixup_f32 v4, v4, v2, v0
 ; CI-NEXT:    v_trunc_f32_e32 v4, v4
@@ -1429,14 +1425,14 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; CI-NEXT:    v_rcp_f32_e32 v5, v4
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; CI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
 ; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
 ; CI-NEXT:    v_mul_f32_e32 v6, v2, v5
 ; CI-NEXT:    v_fma_f32 v7, -v4, v6, v2
 ; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
 ; CI-NEXT:    v_fma_f32 v2, -v4, v6, v2
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v2, v2, v5, v6
 ; CI-NEXT:    v_div_fixup_f32 v2, v2, v3, v1
 ; CI-NEXT:    v_trunc_f32_e32 v2, v2
@@ -1595,16 +1591,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; SI-NEXT:    v_div_scale_f32 v8, vcc, v5, v1, v5
 ; SI-NEXT:    v_div_scale_f32 v9, s[4:5], v1, v1, v5
 ; SI-NEXT:    v_rcp_f32_e32 v10, v9
-; SI-NEXT:    s_mov_b32 s6, 3
-; SI-NEXT:    s_mov_b32 s7, 0
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; SI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
 ; SI-NEXT:    v_fma_f32 v10, v11, v10, v10
 ; SI-NEXT:    v_mul_f32_e32 v11, v8, v10
 ; SI-NEXT:    v_fma_f32 v12, -v9, v11, v8
 ; SI-NEXT:    v_fma_f32 v11, v12, v10, v11
 ; SI-NEXT:    v_fma_f32 v8, -v9, v11, v8
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
 ; SI-NEXT:    v_div_fixup_f32 v8, v8, v1, v5
 ; SI-NEXT:    v_trunc_f32_e32 v8, v8
@@ -1615,14 +1609,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; SI-NEXT:    v_div_scale_f32 v5, vcc, v4, v7, v4
 ; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v7, v7, v4
 ; SI-NEXT:    v_rcp_f32_e32 v9, v8
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
 ; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
 ; SI-NEXT:    v_mul_f32_e32 v10, v5, v9
 ; SI-NEXT:    v_fma_f32 v11, -v8, v10, v5
 ; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
 ; SI-NEXT:    v_fma_f32 v5, -v8, v10, v5
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
 ; SI-NEXT:    v_div_fixup_f32 v5, v5, v7, v4
 ; SI-NEXT:    v_trunc_f32_e32 v5, v5
@@ -1632,14 +1626,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; SI-NEXT:    v_div_scale_f32 v4, vcc, v3, v0, v3
 ; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v0, v0, v3
 ; SI-NEXT:    v_rcp_f32_e32 v7, v5
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; SI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
 ; SI-NEXT:    v_fma_f32 v7, v8, v7, v7
 ; SI-NEXT:    v_mul_f32_e32 v8, v4, v7
 ; SI-NEXT:    v_fma_f32 v9, -v5, v8, v4
 ; SI-NEXT:    v_fma_f32 v8, v9, v7, v8
 ; SI-NEXT:    v_fma_f32 v4, -v5, v8, v4
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
 ; SI-NEXT:    v_div_fixup_f32 v4, v4, v0, v3
 ; SI-NEXT:    v_trunc_f32_e32 v4, v4
@@ -1649,14 +1643,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; SI-NEXT:    v_div_scale_f32 v3, vcc, v2, v6, v2
 ; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v6, v6, v2
 ; SI-NEXT:    v_rcp_f32_e32 v5, v4
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; SI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
 ; SI-NEXT:    v_fma_f32 v5, v7, v5, v5
 ; SI-NEXT:    v_mul_f32_e32 v7, v3, v5
 ; SI-NEXT:    v_fma_f32 v8, -v4, v7, v3
 ; SI-NEXT:    v_fma_f32 v7, v8, v5, v7
 ; SI-NEXT:    v_fma_f32 v3, -v4, v7, v3
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
 ; SI-NEXT:    v_div_fixup_f32 v3, v3, v6, v2
 ; SI-NEXT:    v_trunc_f32_e32 v3, v3
@@ -1682,8 +1676,6 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; CI-NEXT:    s_mov_b32 s7, s3
 ; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    s_mov_b32 s11, s3
-; CI-NEXT:    s_mov_b32 s6, 3
-; CI-NEXT:    s_mov_b32 s7, 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -1702,14 +1694,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; CI-NEXT:    v_div_scale_f32 v9, s[4:5], v1, v1, v5
 ; CI-NEXT:    v_div_scale_f32 v8, vcc, v5, v1, v5
 ; CI-NEXT:    v_rcp_f32_e32 v10, v9
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
 ; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
 ; CI-NEXT:    v_mul_f32_e32 v11, v8, v10
 ; CI-NEXT:    v_fma_f32 v12, -v9, v11, v8
 ; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
 ; CI-NEXT:    v_fma_f32 v8, -v9, v11, v8
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
 ; CI-NEXT:    v_div_fixup_f32 v8, v8, v1, v5
 ; CI-NEXT:    v_trunc_f32_e32 v8, v8
@@ -1720,14 +1712,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; CI-NEXT:    v_rcp_f32_e32 v9, v8
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
 ; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
 ; CI-NEXT:    v_mul_f32_e32 v10, v5, v9
 ; CI-NEXT:    v_fma_f32 v11, -v8, v10, v5
 ; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
 ; CI-NEXT:    v_fma_f32 v5, -v8, v10, v5
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
 ; CI-NEXT:    v_div_fixup_f32 v5, v5, v7, v4
 ; CI-NEXT:    v_trunc_f32_e32 v5, v5
@@ -1737,14 +1729,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; CI-NEXT:    v_or_b32_e32 v1, v4, v1
 ; CI-NEXT:    v_div_scale_f32 v4, vcc, v3, v0, v3
 ; CI-NEXT:    v_rcp_f32_e32 v7, v5
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; CI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
 ; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
 ; CI-NEXT:    v_mul_f32_e32 v8, v4, v7
 ; CI-NEXT:    v_fma_f32 v9, -v5, v8, v4
 ; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
 ; CI-NEXT:    v_fma_f32 v4, -v5, v8, v4
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
 ; CI-NEXT:    v_div_fixup_f32 v4, v4, v0, v3
 ; CI-NEXT:    v_trunc_f32_e32 v4, v4
@@ -1754,14 +1746,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; CI-NEXT:    v_rcp_f32_e32 v5, v4
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; CI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
 ; CI-NEXT:    v_fma_f32 v5, v7, v5, v5
 ; CI-NEXT:    v_mul_f32_e32 v7, v3, v5
 ; CI-NEXT:    v_fma_f32 v8, -v4, v7, v3
 ; CI-NEXT:    v_fma_f32 v7, v8, v5, v7
 ; CI-NEXT:    v_fma_f32 v3, -v4, v7, v3
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
 ; CI-NEXT:    v_div_fixup_f32 v3, v3, v6, v2
 ; CI-NEXT:    v_trunc_f32_e32 v3, v3
@@ -1965,16 +1957,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 ; SI-NEXT:    v_div_scale_f32 v4, vcc, v1, v3, v1
 ; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
 ; SI-NEXT:    v_rcp_f32_e32 v6, v5
-; SI-NEXT:    s_mov_b32 s6, 3
-; SI-NEXT:    s_mov_b32 s7, 0
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
 ; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
 ; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
 ; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
 ; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
 ; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
 ; SI-NEXT:    v_div_fixup_f32 v4, v4, v3, v1
 ; SI-NEXT:    v_trunc_f32_e32 v4, v4
@@ -1982,14 +1972,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 ; SI-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
 ; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
 ; SI-NEXT:    v_rcp_f32_e32 v5, v4
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
 ; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
 ; SI-NEXT:    v_mul_f32_e32 v6, v3, v5
 ; SI-NEXT:    v_fma_f32 v7, -v4, v6, v3
 ; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
 ; SI-NEXT:    v_fma_f32 v3, -v4, v6, v3
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; SI-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
 ; SI-NEXT:    v_trunc_f32_e32 v3, v3
@@ -2014,20 +2004,18 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 ; CI-NEXT:    s_mov_b32 s11, s3
 ; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
-; CI-NEXT:    s_mov_b32 s6, 3
-; CI-NEXT:    s_mov_b32 s7, 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
 ; CI-NEXT:    v_div_scale_f32 v4, vcc, v1, v3, v1
 ; CI-NEXT:    v_rcp_f32_e32 v6, v5
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; CI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
 ; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
 ; CI-NEXT:    v_mul_f32_e32 v7, v4, v6
 ; CI-NEXT:    v_fma_f32 v8, -v5, v7, v4
 ; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
 ; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
 ; CI-NEXT:    v_div_fixup_f32 v4, v4, v3, v1
 ; CI-NEXT:    v_trunc_f32_e32 v4, v4
@@ -2035,14 +2023,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 ; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
 ; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
 ; CI-NEXT:    v_rcp_f32_e32 v5, v4
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; CI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
 ; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
 ; CI-NEXT:    v_mul_f32_e32 v6, v3, v5
 ; CI-NEXT:    v_fma_f32 v7, -v4, v6, v3
 ; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
 ; CI-NEXT:    v_fma_f32 v3, -v4, v6, v3
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
 ; CI-NEXT:    v_trunc_f32_e32 v3, v3
@@ -2054,8 +2042,6 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; VI-NEXT:    s_mov_b32 s2, 3
-; VI-NEXT:    s_mov_b32 s3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    s_add_u32 s0, s0, 32
@@ -2071,14 +2057,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 ; VI-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v3
 ; VI-NEXT:    v_div_scale_f32 v6, vcc, v3, v5, v3
 ; VI-NEXT:    v_rcp_f32_e32 v8, v7
-; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; VI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
 ; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
 ; VI-NEXT:    v_mul_f32_e32 v9, v6, v8
 ; VI-NEXT:    v_fma_f32 v10, -v7, v9, v6
 ; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
 ; VI-NEXT:    v_fma_f32 v6, -v7, v9, v6
-; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; VI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
 ; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v3
 ; VI-NEXT:    v_trunc_f32_e32 v6, v6
@@ -2086,14 +2072,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 ; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v2
 ; VI-NEXT:    v_div_scale_f32 v5, vcc, v2, v4, v2
 ; VI-NEXT:    v_rcp_f32_e32 v7, v6
-; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; VI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
 ; VI-NEXT:    v_fma_f32 v7, v8, v7, v7
 ; VI-NEXT:    v_mul_f32_e32 v8, v5, v7
 ; VI-NEXT:    v_fma_f32 v9, -v6, v8, v5
 ; VI-NEXT:    v_fma_f32 v8, v9, v7, v8
 ; VI-NEXT:    v_fma_f32 v5, -v6, v8, v5
-; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
 ; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v2
 ; VI-NEXT:    v_trunc_f32_e32 v5, v5
@@ -2109,20 +2095,18 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
-; GFX9-NEXT:    s_mov_b32 s2, 3
-; GFX9-NEXT:    s_mov_b32 s3, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f32 v6, s[0:1], v3, v3, v1
 ; GFX9-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v7, v6
-; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX9-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
 ; GFX9-NEXT:    v_fma_f32 v7, v8, v7, v7
 ; GFX9-NEXT:    v_mul_f32_e32 v8, v5, v7
 ; GFX9-NEXT:    v_fma_f32 v9, -v6, v8, v5
 ; GFX9-NEXT:    v_fma_f32 v8, v9, v7, v8
 ; GFX9-NEXT:    v_fma_f32 v5, -v6, v8, v5
-; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX9-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
 ; GFX9-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
@@ -2130,14 +2114,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 ; GFX9-NEXT:    v_div_scale_f32 v5, s[0:1], v2, v2, v0
 ; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v6, v5
-; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX9-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
 ; GFX9-NEXT:    v_fma_f32 v6, v7, v6, v6
 ; GFX9-NEXT:    v_mul_f32_e32 v7, v3, v6
 ; GFX9-NEXT:    v_fma_f32 v8, -v5, v7, v3
 ; GFX9-NEXT:    v_fma_f32 v7, v8, v6, v7
 ; GFX9-NEXT:    v_fma_f32 v3, -v5, v7, v3
-; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX9-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
 ; GFX9-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
@@ -2219,16 +2203,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; SI-NEXT:    v_div_scale_f32 v8, vcc, v3, v7, v3
 ; SI-NEXT:    v_div_scale_f32 v9, s[4:5], v7, v7, v3
 ; SI-NEXT:    v_rcp_f32_e32 v10, v9
-; SI-NEXT:    s_mov_b32 s6, 3
-; SI-NEXT:    s_mov_b32 s7, 0
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; SI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
 ; SI-NEXT:    v_fma_f32 v10, v11, v10, v10
 ; SI-NEXT:    v_mul_f32_e32 v11, v8, v10
 ; SI-NEXT:    v_fma_f32 v12, -v9, v11, v8
 ; SI-NEXT:    v_fma_f32 v11, v12, v10, v11
 ; SI-NEXT:    v_fma_f32 v8, -v9, v11, v8
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
 ; SI-NEXT:    v_div_fixup_f32 v8, v8, v7, v3
 ; SI-NEXT:    v_trunc_f32_e32 v8, v8
@@ -2236,14 +2218,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; SI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
 ; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v6, v6, v2
 ; SI-NEXT:    v_rcp_f32_e32 v9, v8
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
 ; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
 ; SI-NEXT:    v_mul_f32_e32 v10, v7, v9
 ; SI-NEXT:    v_fma_f32 v11, -v8, v10, v7
 ; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
 ; SI-NEXT:    v_fma_f32 v7, -v8, v10, v7
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
 ; SI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
 ; SI-NEXT:    v_trunc_f32_e32 v7, v7
@@ -2251,14 +2233,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; SI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
 ; SI-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, v1
 ; SI-NEXT:    v_rcp_f32_e32 v8, v7
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; SI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
 ; SI-NEXT:    v_fma_f32 v8, v9, v8, v8
 ; SI-NEXT:    v_mul_f32_e32 v9, v6, v8
 ; SI-NEXT:    v_fma_f32 v10, -v7, v9, v6
 ; SI-NEXT:    v_fma_f32 v9, v10, v8, v9
 ; SI-NEXT:    v_fma_f32 v6, -v7, v9, v6
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
 ; SI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
 ; SI-NEXT:    v_trunc_f32_e32 v6, v6
@@ -2266,14 +2248,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; SI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
 ; SI-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, v0
 ; SI-NEXT:    v_rcp_f32_e32 v7, v6
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; SI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
 ; SI-NEXT:    v_fma_f32 v7, v8, v7, v7
 ; SI-NEXT:    v_mul_f32_e32 v8, v5, v7
 ; SI-NEXT:    v_fma_f32 v9, -v6, v8, v5
 ; SI-NEXT:    v_fma_f32 v8, v9, v7, v8
 ; SI-NEXT:    v_fma_f32 v5, -v6, v8, v5
-; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
 ; SI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
 ; SI-NEXT:    v_trunc_f32_e32 v5, v5
@@ -2298,20 +2280,18 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; CI-NEXT:    s_mov_b32 s11, s3
 ; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
-; CI-NEXT:    s_mov_b32 s6, 3
-; CI-NEXT:    s_mov_b32 s7, 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_div_scale_f32 v9, s[4:5], v7, v7, v3
 ; CI-NEXT:    v_div_scale_f32 v8, vcc, v3, v7, v3
 ; CI-NEXT:    v_rcp_f32_e32 v10, v9
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
 ; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
 ; CI-NEXT:    v_mul_f32_e32 v11, v8, v10
 ; CI-NEXT:    v_fma_f32 v12, -v9, v11, v8
 ; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
 ; CI-NEXT:    v_fma_f32 v8, -v9, v11, v8
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
 ; CI-NEXT:    v_div_fixup_f32 v8, v8, v7, v3
 ; CI-NEXT:    v_trunc_f32_e32 v8, v8
@@ -2319,14 +2299,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; CI-NEXT:    v_div_scale_f32 v8, s[4:5], v6, v6, v2
 ; CI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
 ; CI-NEXT:    v_rcp_f32_e32 v9, v8
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
 ; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
 ; CI-NEXT:    v_mul_f32_e32 v10, v7, v9
 ; CI-NEXT:    v_fma_f32 v11, -v8, v10, v7
 ; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
 ; CI-NEXT:    v_fma_f32 v7, -v8, v10, v7
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
 ; CI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
 ; CI-NEXT:    v_trunc_f32_e32 v7, v7
@@ -2334,14 +2314,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; CI-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, v1
 ; CI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
 ; CI-NEXT:    v_rcp_f32_e32 v8, v7
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; CI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
 ; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
 ; CI-NEXT:    v_mul_f32_e32 v9, v6, v8
 ; CI-NEXT:    v_fma_f32 v10, -v7, v9, v6
 ; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
 ; CI-NEXT:    v_fma_f32 v6, -v7, v9, v6
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
 ; CI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
 ; CI-NEXT:    v_trunc_f32_e32 v6, v6
@@ -2349,14 +2329,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; CI-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, v0
 ; CI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
 ; CI-NEXT:    v_rcp_f32_e32 v7, v6
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; CI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
 ; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
 ; CI-NEXT:    v_mul_f32_e32 v8, v5, v7
 ; CI-NEXT:    v_fma_f32 v9, -v6, v8, v5
 ; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
 ; CI-NEXT:    v_fma_f32 v5, -v6, v8, v5
-; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
 ; CI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
 ; CI-NEXT:    v_trunc_f32_e32 v5, v5
@@ -2368,8 +2348,6 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; VI-NEXT:    s_mov_b32 s2, 3
-; VI-NEXT:    s_mov_b32 s3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    s_add_u32 s0, s0, 64
@@ -2385,14 +2363,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; VI-NEXT:    v_div_scale_f32 v11, s[0:1], v7, v7, v3
 ; VI-NEXT:    v_div_scale_f32 v10, vcc, v3, v7, v3
 ; VI-NEXT:    v_rcp_f32_e32 v12, v11
-; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; VI-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
 ; VI-NEXT:    v_fma_f32 v12, v13, v12, v12
 ; VI-NEXT:    v_mul_f32_e32 v13, v10, v12
 ; VI-NEXT:    v_fma_f32 v14, -v11, v13, v10
 ; VI-NEXT:    v_fma_f32 v13, v14, v12, v13
 ; VI-NEXT:    v_fma_f32 v10, -v11, v13, v10
-; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; VI-NEXT:    v_div_fmas_f32 v10, v10, v12, v13
 ; VI-NEXT:    v_div_fixup_f32 v10, v10, v7, v3
 ; VI-NEXT:    v_trunc_f32_e32 v10, v10
@@ -2400,14 +2378,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; VI-NEXT:    v_div_scale_f32 v10, s[0:1], v6, v6, v2
 ; VI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
 ; VI-NEXT:    v_rcp_f32_e32 v11, v10
-; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; VI-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
 ; VI-NEXT:    v_fma_f32 v11, v12, v11, v11
 ; VI-NEXT:    v_mul_f32_e32 v12, v7, v11
 ; VI-NEXT:    v_fma_f32 v13, -v10, v12, v7
 ; VI-NEXT:    v_fma_f32 v12, v13, v11, v12
 ; VI-NEXT:    v_fma_f32 v7, -v10, v12, v7
-; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; VI-NEXT:    v_div_fmas_f32 v7, v7, v11, v12
 ; VI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
 ; VI-NEXT:    v_trunc_f32_e32 v7, v7
@@ -2415,14 +2393,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; VI-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v1
 ; VI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
 ; VI-NEXT:    v_rcp_f32_e32 v10, v7
-; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; VI-NEXT:    v_fma_f32 v11, -v7, v10, 1.0
 ; VI-NEXT:    v_fma_f32 v10, v11, v10, v10
 ; VI-NEXT:    v_mul_f32_e32 v11, v6, v10
 ; VI-NEXT:    v_fma_f32 v12, -v7, v11, v6
 ; VI-NEXT:    v_fma_f32 v11, v12, v10, v11
 ; VI-NEXT:    v_fma_f32 v6, -v7, v11, v6
-; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; VI-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
 ; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
 ; VI-NEXT:    v_trunc_f32_e32 v6, v6
@@ -2430,14 +2408,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v0
 ; VI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
 ; VI-NEXT:    v_rcp_f32_e32 v7, v6
-; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; VI-NEXT:    v_fma_f32 v10, -v6, v7, 1.0
 ; VI-NEXT:    v_fma_f32 v7, v10, v7, v7
 ; VI-NEXT:    v_mul_f32_e32 v10, v5, v7
 ; VI-NEXT:    v_fma_f32 v11, -v6, v10, v5
 ; VI-NEXT:    v_fma_f32 v10, v11, v7, v10
 ; VI-NEXT:    v_fma_f32 v5, -v6, v10, v5
-; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v10
 ; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
 ; VI-NEXT:    v_trunc_f32_e32 v5, v5
@@ -2453,20 +2431,18 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7]
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
-; GFX9-NEXT:    s_mov_b32 s2, 3
-; GFX9-NEXT:    s_mov_b32 s3, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f32 v10, s[0:1], v7, v7, v3
 ; GFX9-NEXT:    v_div_scale_f32 v9, vcc, v3, v7, v3
 ; GFX9-NEXT:    v_rcp_f32_e32 v11, v10
-; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX9-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
 ; GFX9-NEXT:    v_fma_f32 v11, v12, v11, v11
 ; GFX9-NEXT:    v_mul_f32_e32 v12, v9, v11
 ; GFX9-NEXT:    v_fma_f32 v13, -v10, v12, v9
 ; GFX9-NEXT:    v_fma_f32 v12, v13, v11, v12
 ; GFX9-NEXT:    v_fma_f32 v9, -v10, v12, v9
-; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX9-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
 ; GFX9-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
 ; GFX9-NEXT:    v_trunc_f32_e32 v9, v9
@@ -2474,14 +2450,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; GFX9-NEXT:    v_div_scale_f32 v9, s[0:1], v6, v6, v2
 ; GFX9-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
 ; GFX9-NEXT:    v_rcp_f32_e32 v10, v9
-; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX9-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
 ; GFX9-NEXT:    v_fma_f32 v10, v11, v10, v10
 ; GFX9-NEXT:    v_mul_f32_e32 v11, v7, v10
 ; GFX9-NEXT:    v_fma_f32 v12, -v9, v11, v7
 ; GFX9-NEXT:    v_fma_f32 v11, v12, v10, v11
 ; GFX9-NEXT:    v_fma_f32 v7, -v9, v11, v7
-; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX9-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
 ; GFX9-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
 ; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
@@ -2489,14 +2465,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; GFX9-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v1
 ; GFX9-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v9, v7
-; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX9-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
 ; GFX9-NEXT:    v_fma_f32 v9, v10, v9, v9
 ; GFX9-NEXT:    v_mul_f32_e32 v10, v6, v9
 ; GFX9-NEXT:    v_fma_f32 v11, -v7, v10, v6
 ; GFX9-NEXT:    v_fma_f32 v10, v11, v9, v10
 ; GFX9-NEXT:    v_fma_f32 v6, -v7, v10, v6
-; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX9-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
 ; GFX9-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
@@ -2504,14 +2480,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; GFX9-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v0
 ; GFX9-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v7, v6
-; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX9-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
 ; GFX9-NEXT:    v_fma_f32 v7, v9, v7, v7
 ; GFX9-NEXT:    v_mul_f32_e32 v9, v5, v7
 ; GFX9-NEXT:    v_fma_f32 v10, -v6, v9, v5
 ; GFX9-NEXT:    v_fma_f32 v9, v10, v7, v9
 ; GFX9-NEXT:    v_fma_f32 v5, -v6, v9, v5
-; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX9-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
 ; GFX9-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
@@ -2636,16 +2612,15 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
 ; SI-NEXT:    v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
 ; SI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
 ; SI-NEXT:    v_bfe_u32 v10, v9, 20, 11
-; SI-NEXT:    s_movk_i32 s8, 0xfc01
-; SI-NEXT:    v_add_i32_e32 v12, vcc, s8, v10
+; SI-NEXT:    v_add_i32_e32 v12, vcc, 0xfffffc01, v10
 ; SI-NEXT:    s_mov_b32 s3, 0xfffff
 ; SI-NEXT:    v_lshr_b64 v[10:11], s[2:3], v12
 ; SI-NEXT:    v_not_b32_e32 v10, v10
 ; SI-NEXT:    v_and_b32_e32 v10, v8, v10
 ; SI-NEXT:    v_not_b32_e32 v11, v11
 ; SI-NEXT:    v_and_b32_e32 v11, v9, v11
-; SI-NEXT:    s_brev_b32 s9, 1
-; SI-NEXT:    v_and_b32_e32 v13, s9, v9
+; SI-NEXT:    s_brev_b32 s8, 1
+; SI-NEXT:    v_and_b32_e32 v13, s8, v9
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v12
 ; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v12
@@ -2669,13 +2644,13 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
 ; SI-NEXT:    v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
 ; SI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
 ; SI-NEXT:    v_bfe_u32 v8, v7, 20, 11
-; SI-NEXT:    v_add_i32_e32 v10, vcc, s8, v8
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 0xfffffc01, v8
 ; SI-NEXT:    v_lshr_b64 v[8:9], s[2:3], v10
 ; SI-NEXT:    v_not_b32_e32 v8, v8
 ; SI-NEXT:    v_and_b32_e32 v8, v6, v8
 ; SI-NEXT:    v_not_b32_e32 v9, v9
 ; SI-NEXT:    v_and_b32_e32 v9, v7, v9
-; SI-NEXT:    v_and_b32_e32 v11, s9, v7
+; SI-NEXT:    v_and_b32_e32 v11, s8, v7
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v10
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v10

diff  --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 994cc78ec7917..a3058fc1f4441 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -716,10 +716,9 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0xf000f
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xf000f, v3
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -929,37 +928,36 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
-; GFX10-NEXT:    v_lshlrev_b16 v6, 1, v6
-; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v7
-; GFX10-NEXT:    v_lshrrev_b16 v7, v7, v8
-; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
-; GFX10-NEXT:    v_lshlrev_b16 v6, v9, v6
-; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_lshlrev_b16 v12, 1, v12
-; GFX10-NEXT:    v_xor_b32_e32 v13, -1, v11
-; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
+; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v5
+; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v12, -1, v4
+; GFX10-NEXT:    v_lshrrev_b16 v6, v7, v6
+; GFX10-NEXT:    v_lshlrev_b16 v8, 1, v8
+; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX10-NEXT:    v_lshlrev_b16 v10, 1, v10
+; GFX10-NEXT:    v_xor_b32_e32 v14, -1, v9
+; GFX10-NEXT:    v_lshlrev_b16 v1, v11, v1
+; GFX10-NEXT:    v_lshlrev_b16 v0, v12, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
-; GFX10-NEXT:    v_lshlrev_b16 v1, v9, v1
 ; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
-; GFX10-NEXT:    v_lshrrev_b16 v4, v11, v10
-; GFX10-NEXT:    v_lshlrev_b16 v5, v13, v12
+; GFX10-NEXT:    v_lshlrev_b16 v4, v7, v8
+; GFX10-NEXT:    v_lshrrev_b16 v5, v9, v13
+; GFX10-NEXT:    v_lshlrev_b16 v7, v14, v10
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX10-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, v2, v1
-; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
-; GFX10-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
+; GFX10-NEXT:    v_or_b32_e32 v2, v4, v6
+; GFX10-NEXT:    v_or_b32_e32 v3, v7, v5
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
   ret <4 x i16> %ret
@@ -1241,14 +1239,12 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_mov_b32 s4, 0xffffff
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX10-NEXT:    v_and_b32_e32 v6, s4, v4
-; GFX10-NEXT:    v_and_b32_e32 v7, s4, v5
-; GFX10-NEXT:    s_mov_b32 s4, 0xaaaaaaab
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX10-NEXT:    v_mul_hi_u32 v6, v6, s4
-; GFX10-NEXT:    v_mul_hi_u32 v7, v7, s4
+; GFX10-NEXT:    v_mul_hi_u32 v6, 0xaaaaaaab, v6
+; GFX10-NEXT:    v_mul_hi_u32 v7, 0xaaaaaaab, v7
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 4, v7
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v6, 24, v6

diff  --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 3d88dfc7ddd62..56707bc7c6e47 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -406,18 +406,17 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX9-LABEL: udiv16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_movk_i32 s5, 0x400
+; GFX9-NEXT:    s_movk_i32 s4, 0x400
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s2, s4, s2
+; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX9-NEXT:  .LBB4_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v4
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -429,7 +428,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v0
 ; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
 ; GFX9-NEXT:    v_mad_f32 v0, -v0, v2, v8
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v0|, v2
 ; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1]
 ; GFX9-NEXT:    global_store_short v[5:6], v0, off
@@ -442,16 +441,15 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_and_b32 s0, s1, s4
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s4
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX10-NEXT:  .LBB4_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_and_b32_e32 v0, s1, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
 ; GFX10-NEXT:    v_add_nc_u16 v4, v4, 1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v7, v0
 ; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
@@ -491,17 +489,16 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s6, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_movk_i32 s8, 0x400
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s7, s6, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s7
+; GFX9-NEXT:    s_movk_i32 s7, 0x400
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s6, 0xffff, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX9-NEXT:  .LBB5_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_and_b32_e32 v0, s6, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v4
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
 ; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
@@ -511,9 +508,9 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v10, v9
 ; GFX9-NEXT:    v_mad_f32 v8, -v9, v2, v8
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v8|, v2
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s8, v4
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s7, v4
 ; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v8, v8, s7
+; GFX9-NEXT:    v_mul_lo_u32 v8, v8, s6
 ; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s4, v5
 ; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v8
@@ -527,16 +524,15 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_and_b32 s4, s1, s4
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s4
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX10-NEXT:  .LBB5_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_and_b32_e32 v0, s1, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
 ; GFX10-NEXT:    v_add_nc_u16 v4, v4, 1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v7, v0
 ; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
@@ -549,7 +545,7 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v7|, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
-; GFX10-NEXT:    v_mul_lo_u32 v7, v7, s4
+; GFX10-NEXT:    v_mul_lo_u32 v7, v7, s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v7
 ; GFX10-NEXT:    global_store_short v[5:6], v0, off
 ; GFX10-NEXT:    s_cbranch_vccz .LBB5_1

diff  --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index 597ed577c16ea..1b4a181f7e127 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -2104,16 +2104,15 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_and_b32_e32 v0, s3, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v0, 0xffff, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, s3, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v2
 ; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 4aa58fbc42f51..228c17bb5e80a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -213,6 +213,7 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -226,7 +227,6 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
 ; GFX8-NEXT:    v_and_b32_e32 v6, s0, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
@@ -315,7 +315,7 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT:    s_movk_i32 s0, 0xff
+; GFX10-DL-NEXT:    v_mov_b32_e32 v8, 0xff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
@@ -323,17 +323,17 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, s0, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, s0, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v5, s0, v5
-; GFX10-DL-NEXT:    v_and_b32_e32 v6, s0, v6
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 0xff, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
-; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
@@ -1208,16 +1208,15 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_movk_i32 s3, 0xff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_and_b32_e32 v0, s3, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v0, 0xff, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, s3, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xff, v2
 ; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v5, v0, v3
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1511,6 +1510,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -1524,7 +1524,6 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
 ; GFX8-NEXT:    v_and_b32_e32 v8, s0, v8
@@ -1613,7 +1612,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT:    s_movk_i32 s0, 0xff
+; GFX10-DL-NEXT:    v_mov_b32_e32 v7, 0xff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
@@ -1625,16 +1624,16 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX10-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
-; GFX10-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, s0, v4
-; GFX10-DL-NEXT:    v_and_b32_e32 v5, s0, v5
+; GFX10-DL-NEXT:    v_bfe_i32 v8, v2, 0, 8
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
-; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX10-DL-NEXT:    v_mad_u16 v3, v6, v8, v3
 ; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
 ; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
@@ -1802,18 +1801,18 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_and_b32_sdwa v0, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-DL-NEXT:    v_and_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_and_b32_sdwa v3, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-DL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v0, v0, v3
+; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v0, v3, v0
 ; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
 ; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
@@ -1901,6 +1900,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -1914,7 +1914,6 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 8, v3
@@ -2015,7 +2014,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT:    v_mov_b32_e32 v5, 0xff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
@@ -2023,27 +2022,27 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b16 v5, 8, v1
+; GFX10-DL-NEXT:    v_lshrrev_b16 v6, 8, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b16 v6, 8, v2
-; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_and_b32_sdwa v9, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_and_b32_sdwa v10, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_lshrrev_b16 v7, 8, v2
+; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v9, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
-; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v8
+; GFX10-DL-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
+; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v6, 16, v4
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, v4, v10
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, v4, v9
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v5, v5, v6
-; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v7
-; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 0xffff, v9
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
+; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
+; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v1, 16, v6
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v5, v3
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v4
+; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v5
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v2, v1
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 8563d321c83ae..eacbd6f0e2e60 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -2519,7 +2519,6 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
@@ -2533,79 +2532,79 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-XNACK-NEXT:    global_load_ushort v3, v0, s[0:1]
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 4, v1
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v1
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v4, 4, v1
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v1
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 4, v2
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v13, 12, v2
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 8, v2
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 12, v2
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 4, v2
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v12, 12, v2
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
-; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v13, v4, v13
-; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v6, v4, v6
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v17, 16, v2
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 12, v1
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
-; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v12, v12, 16, v13
-; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v5, v5, 16, v6
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 20, v1
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 20, v2
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
+; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v14
-; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v13, v4, v15
-; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v8, v4, v8
-; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v5, v5, v12
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v1
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v18, 28, v2
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
+; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v11, v11, 16, v12
+; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 20, v1
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 20, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v14, 12, v17
-; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v6, v6, 16, v13
-; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v13
+; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v12, 0xffff, v14
+; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v4, v4, v11
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v17, 28, v2
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v13, 12, v16
+; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v5, v5, 16, v12
+; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v5, v3
+; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v4, v3
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v12, 12, v16
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v11, 12, v15
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
-; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v5, v4, v14
-; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v10, v4, v10
-; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v6, v7, v6
-; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v8
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v18, 12, v18
+; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v4, 0xffff, v13
+; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v5, v6, v5
+; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v7
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
-; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v5, v12, 16, v5
-; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v7, v9, 16, v10
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v6
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v18
-; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v2, v4, v2
-; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v1, v4, v1
-; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v4, v7, v5
-; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v8
-; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v2, v6, 16, v2
-; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
+; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v4, v11, 16, v4
+; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v6, v8, 16, v9
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v5
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v17
+; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v4, v6, v4
+; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v7
+; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
+; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v4
 ; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
@@ -2622,7 +2621,6 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
@@ -2635,79 +2633,79 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-DL-NOXNACK-NEXT:    global_load_ushort v3, v2, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 4, v1
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v4, 4, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v1
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 4, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v13, 12, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 8, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 12, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 4, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v12, 12, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 8, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
-; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v13, v4, v13
-; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v6, v4, v6
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 12, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
-; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v12, v12, 16, v13
-; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v5, v5, 16, v6
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 20, v1
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 20, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
+; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v14
-; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v13, v4, v15
-; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v8, v4, v8
-; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v5, v5, v12
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v1
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v18, 28, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
+; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v11, v11, 16, v12
+; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 20, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 20, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v14, 12, v17
-; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v6, v6, 16, v13
-; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v13
+; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v12, 0xffff, v14
+; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v4, v4, v11
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v17, 28, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v13, 12, v16
+; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v5, v5, 16, v12
+; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v5, v3
+; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v4, v3
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v12, 12, v16
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v15
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
-; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v5, v4, v14
-; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v10, v4, v10
-; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v6, v7, v6
-; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v8
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v18, 12, v18
+; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v4, 0xffff, v13
+; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v5, v6, v5
+; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v7
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v5, v12, 16, v5
-; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v7, v9, 16, v10
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v6
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v18
-; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v0, v4, v0
-; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v1, v4, v1
-; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v4, v7, v5
-; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v8
-; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v4, v11, 16, v4
+; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v6, v8, 16, v9
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v5
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v17
+; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v4, v6, v4
+; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v7
+; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v0, v1, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 0b99cb5cdc813..7994f83fa4313 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -2355,7 +2355,6 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s10, -1
@@ -2369,49 +2368,49 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v2
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, v4, v7
-; GFX10-DL-NEXT:    v_and_b32_e32 v6, v4, v6
-; GFX10-DL-NEXT:    v_bfe_u32 v13, v2, 8, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
-; GFX10-DL-NEXT:    v_and_b32_e32 v9, v4, v9
-; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v7
-; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v10, 16, v6
-; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 12, 4
-; GFX10-DL-NEXT:    v_and_b32_e32 v13, v4, v13
-; GFX10-DL-NEXT:    v_bfe_u32 v12, v1, 16, 4
-; GFX10-DL-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v5, v5, v6
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 16, 4
-; GFX10-DL-NEXT:    v_lshl_or_b32 v10, v10, 16, v13
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 20, 4
-; GFX10-DL-NEXT:    v_and_b32_e32 v12, v4, v12
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v2
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v1, 4, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 4, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v8, v1, 8, 4
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX10-DL-NEXT:    v_bfe_u32 v12, v2, 8, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
+; GFX10-DL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v6
+; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v9, 16, v5
+; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 12, 4
+; GFX10-DL-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 16, 4
+; GFX10-DL-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX10-DL-NEXT:    v_bfe_u32 v5, v2, 16, 4
+; GFX10-DL-NEXT:    v_lshl_or_b32 v9, v9, 16, v12
+; GFX10-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
+; GFX10-DL-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v5, v3
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v2, 20, 4
-; GFX10-DL-NEXT:    v_and_b32_e32 v6, v4, v6
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v8, v8, v10
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 28, v1
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v9
-; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 24, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 24, 4
-; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v6
-; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v11, 16, v12
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 20, 4
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v7, v7, v9
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
 ; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v8
+; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 24, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v8, v2, 24, 4
+; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
+; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v10, 16, v11
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v7
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v8, v4, v9
-; GFX10-DL-NEXT:    v_and_b32_e32 v1, v4, v1
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v6, v5
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v10
-; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v8
-; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 0xffff, v8
+; GFX10-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v9
+; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v7
+; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v4
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
@@ -3112,7 +3111,6 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s10, -1
@@ -3126,49 +3124,49 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v2
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, v4, v7
-; GFX10-DL-NEXT:    v_and_b32_e32 v6, v4, v6
-; GFX10-DL-NEXT:    v_bfe_u32 v13, v2, 8, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
-; GFX10-DL-NEXT:    v_and_b32_e32 v9, v4, v9
-; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v7
-; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v10, 16, v6
-; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 12, 4
-; GFX10-DL-NEXT:    v_and_b32_e32 v13, v4, v13
-; GFX10-DL-NEXT:    v_bfe_u32 v12, v1, 16, 4
-; GFX10-DL-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v5, v5, v6
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 16, 4
-; GFX10-DL-NEXT:    v_lshl_or_b32 v10, v10, 16, v13
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 20, 4
-; GFX10-DL-NEXT:    v_and_b32_e32 v12, v4, v12
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v2
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v1, 4, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 4, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v8, v1, 8, 4
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX10-DL-NEXT:    v_bfe_u32 v12, v2, 8, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
+; GFX10-DL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v6
+; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v9, 16, v5
+; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 12, 4
+; GFX10-DL-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 16, 4
+; GFX10-DL-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX10-DL-NEXT:    v_bfe_u32 v5, v2, 16, 4
+; GFX10-DL-NEXT:    v_lshl_or_b32 v9, v9, 16, v12
+; GFX10-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
+; GFX10-DL-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v5, v3
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v2, 20, 4
-; GFX10-DL-NEXT:    v_and_b32_e32 v6, v4, v6
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v8, v8, v10
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 28, v1
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v9
-; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 24, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 24, 4
-; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v6
-; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v11, 16, v12
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 20, 4
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v7, v7, v9
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
 ; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v8
+; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 24, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v8, v2, 24, 4
+; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
+; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v10, 16, v11
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v7
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v8, v4, v9
-; GFX10-DL-NEXT:    v_and_b32_e32 v1, v4, v1
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v6, v5
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v10
-; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v8
-; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 0xffff, v8
+; GFX10-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v9
+; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v7
+; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v4
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll
index 90c248db9877e..2cc6ea123fe69 100644
--- a/llvm/test/CodeGen/AMDGPU/immv216.ll
+++ b/llvm/test/CodeGen/AMDGPU/immv216.ll
@@ -341,12 +341,11 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace
 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x00,0x01,0x00,0x08]
 ; GFX9: buffer_store_dword [[REG]]
 
-; VI-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400 ; encoding
 ; VI-DAG: buffer_load_dword
 ; VI-NOT: and
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0x6400, v{{[0-9]+}}
 ; gfx8 does not support sreg or imm in sdwa - this will be move then
-; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
+; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x6400
 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: buffer_store_dword

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 624a6caf94872..a0b20f4788abf 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -481,9 +481,8 @@ define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %v
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s6, s6, 3
 ; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
-; GCN-NEXT:    s_mov_b32 s6, 0x1010101
-; GCN-NEXT:    s_and_b32 s7, s5, s6
-; GCN-NEXT:    s_and_b32 s6, s4, s6
+; GCN-NEXT:    s_and_b32 s7, s5, 0x1010101
+; GCN-NEXT:    s_and_b32 s6, s4, 0x1010101
 ; GCN-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GCN-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 4de859f473dfc..1b13cc34f3ef8 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1072,9 +1072,8 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
 ; SI-NEXT:    s_lshl_b32 s8, s6, 4
 ; SI-NEXT:    s_mov_b64 s[6:7], 0xffff
 ; SI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
-; SI-NEXT:    s_mov_b32 s8, 0x50005
-; SI-NEXT:    s_and_b32 s9, s7, s8
-; SI-NEXT:    s_and_b32 s8, s6, s8
+; SI-NEXT:    s_and_b32 s9, s7, 0x50005
+; SI-NEXT:    s_and_b32 s8, s6, 0x50005
 ; SI-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
 ; SI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v0, s5
@@ -1248,9 +1247,8 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
 ; SI-NEXT:    s_lshl_b32 s8, s8, 3
 ; SI-NEXT:    s_mov_b64 s[2:3], 0xffff
 ; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
-; SI-NEXT:    s_mov_b32 s8, 0x5050505
-; SI-NEXT:    s_and_b32 s9, s3, s8
-; SI-NEXT:    s_and_b32 s8, s2, s8
+; SI-NEXT:    s_and_b32 s9, s3, 0x5050505
+; SI-NEXT:    s_and_b32 s8, s2, 0x5050505
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
@@ -1272,9 +1270,8 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
 ; VI-NEXT:    s_lshl_b32 s8, s8, 3
 ; VI-NEXT:    s_mov_b64 s[2:3], 0xffff
 ; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
-; VI-NEXT:    s_mov_b32 s8, 0x5050505
-; VI-NEXT:    s_and_b32 s9, s3, s8
-; VI-NEXT:    s_and_b32 s8, s2, s8
+; VI-NEXT:    s_and_b32 s9, s3, 0x5050505
+; VI-NEXT:    s_and_b32 s8, s2, 0x5050505
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; VI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index a80604fde3ce0..ee401344580d6 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1606,7 +1606,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
 ; VI-NEXT:    s_mov_b64 s[2:3], 0xffff
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    s_lshl_b32 s1, s4, 16
-; VI-NEXT:    s_and_b32 s4, s4, s2
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    s_or_b32 s0, s4, s1
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1633,7 +1633,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
 ; CI-NEXT:    s_mov_b64 s[2:3], 0xffff
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    s_lshl_b32 s1, s4, 16
-; CI-NEXT:    s_and_b32 s4, s4, s2
+; CI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
 ; CI-NEXT:    s_or_b32 s0, s4, s1
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1691,7 +1691,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    s_lshl_b32 s1, s5, 4
 ; VI-NEXT:    s_lshl_b32 s5, s4, 16
-; VI-NEXT:    s_and_b32 s4, s4, s2
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    s_lshl_b64 s[0:1], s[2:3], s1
 ; VI-NEXT:    s_or_b32 s2, s4, s5
@@ -1716,7 +1716,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    s_mov_b64 s[2:3], 0xffff
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    s_and_b32 s6, s4, s2
+; CI-NEXT:    s_and_b32 s6, s4, 0xffff
 ; CI-NEXT:    s_lshl_b32 s1, s5, 4
 ; CI-NEXT:    s_lshl_b32 s4, s4, 16
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll
index 0ae12149de211..a5129279ddef4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll
@@ -37,11 +37,10 @@ main_body:
 ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
 ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
 
-; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
 
 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
index b3c66ef9284c6..7367f9c315661 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@@ -566,10 +566,9 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ;
 ; GFX10-LABEL: sample_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v4, v6, v4
-; GFX10-NEXT:    v_and_b32_e32 v2, v6, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v6, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
@@ -602,12 +601,11 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v12, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v6
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v5
-; GFX10-NEXT:    v_and_b32_e32 v5, v2, v6
-; GFX10-NEXT:    v_and_b32_e32 v3, v2, v3
-; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX10-NEXT:    v_lshl_or_b32 v11, v7, 16, v5
+; GFX10-NEXT:    v_lshl_or_b32 v11, v7, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v9, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
 ; GFX10-NEXT:    image_sample_d_g16 v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
@@ -653,10 +651,9 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ;
 ; GFX10-LABEL: sample_c_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v5, v7, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, v7, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v7, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
@@ -705,10 +702,9 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ;
 ; GFX10-LABEL: sample_d_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v4, v7, v4
-; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
@@ -759,10 +755,9 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ;
 ; GFX10-LABEL: sample_c_d_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v5, v8, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, v8, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v8, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
@@ -807,10 +802,9 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ;
 ; GFX10-LABEL: sample_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v4, v6, v4
-; GFX10-NEXT:    v_and_b32_e32 v2, v6, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v6, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
@@ -857,10 +851,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ;
 ; GFX10-LABEL: sample_c_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v5, v7, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, v7, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v7, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
@@ -909,10 +902,9 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ;
 ; GFX10-LABEL: sample_cd_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v4, v7, v4
-; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
@@ -963,10 +955,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ;
 ; GFX10-LABEL: sample_c_cd_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v5, v8, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, v8, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v8, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
@@ -1160,15 +1151,14 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-LABEL: sample_c_d_o_2darray_V1:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v13, v8
-; GFX10-NEXT:    v_mov_b32_e32 v8, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, v0, v6
-; GFX10-NEXT:    v_and_b32_e32 v4, v0, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX10-NEXT:    v_lshl_or_b32 v12, v7, 16, v1
-; GFX10-NEXT:    v_lshl_or_b32 v11, v5, 16, v4
-; GFX10-NEXT:    v_lshl_or_b32 v10, v3, 16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v6
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_lshl_or_b32 v12, v7, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v11, v5, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v10, v3, 16, v2
 ; GFX10-NEXT:    image_sample_c_d_o_g16 v0, v[8:13], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -1197,15 +1187,14 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX10-LABEL: sample_c_d_o_2darray_V2:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v13, v8
-; GFX10-NEXT:    v_mov_b32_e32 v8, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, v0, v6
-; GFX10-NEXT:    v_and_b32_e32 v4, v0, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX10-NEXT:    v_lshl_or_b32 v12, v7, 16, v1
-; GFX10-NEXT:    v_lshl_or_b32 v11, v5, 16, v4
-; GFX10-NEXT:    v_lshl_or_b32 v10, v3, 16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v6
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_lshl_or_b32 v12, v7, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v11, v5, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v10, v3, 16, v2
 ; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
index d23caa43bbb62..e3f2269bbe0b7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
@@ -62,11 +62,10 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ;
 ; GFX10GISEL-LABEL: sample_d_3d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v6, v6, v9, v7
-; GFX10GISEL-NEXT:    v_and_or_b32 v7, v8, v9, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v6, 0xffff, v6, v7
+; GFX10GISEL-NEXT:    v_and_or_b32 v7, 0xffff, v8, s12
 ; GFX10GISEL-NEXT:    image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -152,11 +151,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ;
 ; GFX10GISEL-LABEL: sample_d_cl_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v4, v4, v7, v5
-; GFX10GISEL-NEXT:    v_and_or_b32 v5, v6, v7, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v4, 0xffff, v4, v5
+; GFX10GISEL-NEXT:    v_and_or_b32 v5, 0xffff, v6, s12
 ; GFX10GISEL-NEXT:    image_sample_d_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -203,11 +201,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ;
 ; GFX10GISEL-LABEL: sample_c_d_cl_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v8, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v5, v5, v8, v6
-; GFX10GISEL-NEXT:    v_and_or_b32 v6, v7, v8, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v5, 0xffff, v5, v6
+; GFX10GISEL-NEXT:    v_and_or_b32 v6, 0xffff, v7, s12
 ; GFX10GISEL-NEXT:    image_sample_c_d_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -333,11 +330,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ;
 ; GFX10GISEL-LABEL: sample_cd_cl_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v4, v4, v7, v5
-; GFX10GISEL-NEXT:    v_and_or_b32 v5, v6, v7, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v4, 0xffff, v4, v5
+; GFX10GISEL-NEXT:    v_and_or_b32 v5, 0xffff, v6, s12
 ; GFX10GISEL-NEXT:    image_sample_cd_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -384,11 +380,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ;
 ; GFX10GISEL-LABEL: sample_c_cd_cl_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v8, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v5, v5, v8, v6
-; GFX10GISEL-NEXT:    v_and_or_b32 v6, v7, v8, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v5, 0xffff, v5, v6
+; GFX10GISEL-NEXT:    v_and_or_b32 v6, 0xffff, v7, s12
 ; GFX10GISEL-NEXT:    image_sample_c_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -415,11 +410,10 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ;
 ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V1:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v6, v6, v9, v7
-; GFX10GISEL-NEXT:    v_and_or_b32 v7, v8, v9, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v6, 0xffff, v6, v7
+; GFX10GISEL-NEXT:    v_and_or_b32 v7, 0xffff, v8, s12
 ; GFX10GISEL-NEXT:    image_sample_c_d_o v0, v[0:7], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -446,11 +440,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ;
 ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V2:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v6, v6, v9, v7
-; GFX10GISEL-NEXT:    v_and_or_b32 v7, v8, v9, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v6, 0xffff, v6, v7
+; GFX10GISEL-NEXT:    v_and_or_b32 v7, 0xffff, v8, s12
 ; GFX10GISEL-NEXT:    image_sample_c_d_o v[0:1], v[0:7], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -490,10 +483,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_1d(<8 x i32> inreg %rsrc, <4 x
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_d_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v0, v0, v3, s12
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v1, v3, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
 ; GFX10GISEL-NEXT:    image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -505,9 +497,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_g16_noa16_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_g16_noa16_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v2, v6, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v6, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -516,11 +507,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_2d(<8 x i32> inreg %rsrc, <4 x
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_d_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v6, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10GISEL-NEXT:    v_and_or_b32 v0, v0, v6, v1
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v2, v6, v3
+; GFX10GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10GISEL-NEXT:    v_and_or_b32 v1, 0xffff, v2, v3
 ; GFX10GISEL-NEXT:    image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -534,10 +524,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_3d(<8 x i32> inreg %rsrc, <4 x
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v9, v2, v9
-; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v9
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v9
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
 ; GFX10-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -547,14 +536,13 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_3d(<8 x i32> inreg %rsrc, <4 x
 ; GFX10GISEL:       ; %bb.0: ; %main_body
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v2
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v10, v3
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v11, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v3, v9, v11, s12
-; GFX10GISEL-NEXT:    v_and_or_b32 v2, v0, v11, v1
-; GFX10GISEL-NEXT:    v_and_or_b32 v4, v10, v11, v4
-; GFX10GISEL-NEXT:    v_and_or_b32 v5, v5, v11, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v3, 0xffff, v9, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v2, 0xffff, v0, v1
+; GFX10GISEL-NEXT:    v_and_or_b32 v4, 0xffff, v10, v4
+; GFX10GISEL-NEXT:    v_and_or_b32 v5, 0xffff, v5, s12
 ; GFX10GISEL-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -572,10 +560,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_1d(<8 x i32> inreg %rsrc, <4
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v1, v4, s12
-; GFX10GISEL-NEXT:    v_and_or_b32 v2, v2, v4, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v2, 0xffff, v2, s12
 ; GFX10GISEL-NEXT:    image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -587,9 +574,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_g16_noa16_c_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v7, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v7, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -598,11 +584,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_2d(<8 x i32> inreg %rsrc, <4
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v1, v7, v2
-; GFX10GISEL-NEXT:    v_and_or_b32 v2, v3, v7, v4
+; GFX10GISEL-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10GISEL-NEXT:    v_and_or_b32 v2, 0xffff, v3, v4
 ; GFX10GISEL-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -620,10 +605,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_1d(<8 x i32> inreg %rsrc, <4
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v0, v0, v4, s12
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v1, v4, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
 ; GFX10GISEL-NEXT:    image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -635,9 +619,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_g16_noa16_d_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -646,11 +629,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_2d(<8 x i32> inreg %rsrc, <4
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10GISEL-NEXT:    v_and_or_b32 v0, v0, v7, v1
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v2, v7, v3
+; GFX10GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10GISEL-NEXT:    v_and_or_b32 v1, 0xffff, v2, v3
 ; GFX10GISEL-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -668,10 +650,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_1d(<8 x i32> inreg %rsrc,
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v1, v5, s12
-; GFX10GISEL-NEXT:    v_and_or_b32 v2, v2, v5, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v2, 0xffff, v2, s12
 ; GFX10GISEL-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -685,11 +666,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_2d(<8 x i32> inreg %rsrc,
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v0, v3
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v1
 ; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -699,11 +679,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_2d(<8 x i32> inreg %rsrc,
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
-; GFX10GISEL-NEXT:    v_and_or_b32 v4, v9, v0, v4
-; GFX10GISEL-NEXT:    v_and_or_b32 v3, v1, v0, v3
+; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
+; GFX10GISEL-NEXT:    v_and_or_b32 v4, 0xffff, v9, v4
+; GFX10GISEL-NEXT:    v_and_or_b32 v3, 0xffff, v1, v0
 ; GFX10GISEL-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -721,10 +700,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_1d(<8 x i32> inreg %rsrc, <4 x
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_cd_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v0, v0, v3, s12
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v1, v3, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
 ; GFX10GISEL-NEXT:    image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -736,9 +714,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_g16_noa16_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_g16_noa16_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v2, v6, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v6, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-NEXT:    image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -747,11 +724,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_2d(<8 x i32> inreg %rsrc, <4 x
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_cd_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v6, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10GISEL-NEXT:    v_and_or_b32 v0, v0, v6, v1
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v2, v6, v3
+; GFX10GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10GISEL-NEXT:    v_and_or_b32 v1, 0xffff, v2, v3
 ; GFX10GISEL-NEXT:    image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -769,10 +745,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_1d(<8 x i32> inreg %rsrc, <4
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v1, v4, s12
-; GFX10GISEL-NEXT:    v_and_or_b32 v2, v2, v4, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v2, 0xffff, v2, s12
 ; GFX10GISEL-NEXT:    image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -784,9 +759,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_g16_noa16_c_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v7, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v7, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10-NEXT:    image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -795,11 +769,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_2d(<8 x i32> inreg %rsrc, <4
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v1, v7, v2
-; GFX10GISEL-NEXT:    v_and_or_b32 v2, v3, v7, v4
+; GFX10GISEL-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10GISEL-NEXT:    v_and_or_b32 v2, 0xffff, v3, v4
 ; GFX10GISEL-NEXT:    image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -817,10 +790,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_1d(<8 x i32> inreg %rsrc, <
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v0, v0, v4, s12
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v1, v4, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
 ; GFX10GISEL-NEXT:    image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -832,9 +804,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_g16_noa16_cd_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-NEXT:    image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -843,11 +814,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_2d(<8 x i32> inreg %rsrc, <
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10GISEL-NEXT:    v_and_or_b32 v0, v0, v7, v1
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v2, v7, v3
+; GFX10GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10GISEL-NEXT:    v_and_or_b32 v1, 0xffff, v2, v3
 ; GFX10GISEL-NEXT:    image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -865,10 +835,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_1d(<8 x i32> inreg %rsrc,
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v1, v5, s12
-; GFX10GISEL-NEXT:    v_and_or_b32 v2, v2, v5, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v1, 0xffff, v1, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v2, 0xffff, v2, s12
 ; GFX10GISEL-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -882,11 +851,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_2d(<8 x i32> inreg %rsrc,
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v0, v3
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v1
 ; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -896,11 +864,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_2d(<8 x i32> inreg %rsrc,
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
-; GFX10GISEL-NEXT:    v_and_or_b32 v4, v9, v0, v4
-; GFX10GISEL-NEXT:    v_and_or_b32 v3, v1, v0, v3
+; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
+; GFX10GISEL-NEXT:    v_and_or_b32 v4, 0xffff, v9, v4
+; GFX10GISEL-NEXT:    v_and_or_b32 v3, 0xffff, v1, v0
 ; GFX10GISEL-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -913,14 +880,13 @@ define amdgpu_ps float @sample_g16_noa16_c_d_o_2darray_V1(<8 x i32> inreg %rsrc,
 ; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V1:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v9
-; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v1
-; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v9
+; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v1
 ; GFX10-NEXT:    image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -932,11 +898,10 @@ define amdgpu_ps float @sample_g16_noa16_c_d_o_2darray_V1(<8 x i32> inreg %rsrc,
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v11, v4
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v9
-; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10GISEL-NEXT:    v_and_or_b32 v4, v10, v0, v1
-; GFX10GISEL-NEXT:    v_and_or_b32 v5, v11, v0, v5
+; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v9
+; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
+; GFX10GISEL-NEXT:    v_and_or_b32 v4, 0xffff, v10, v0
+; GFX10GISEL-NEXT:    v_and_or_b32 v5, 0xffff, v11, v1
 ; GFX10GISEL-NEXT:    image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -949,14 +914,13 @@ define amdgpu_ps <2 x float> @sample_g16_noa16_c_d_o_2darray_V2(<8 x i32> inreg
 ; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V2:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v9
-; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v1
-; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v9
+; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v1
 ; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -968,11 +932,10 @@ define amdgpu_ps <2 x float> @sample_g16_noa16_c_d_o_2darray_V2(<8 x i32> inreg
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v11, v4
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v9
-; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10GISEL-NEXT:    v_and_or_b32 v4, v10, v0, v1
-; GFX10GISEL-NEXT:    v_and_or_b32 v5, v11, v0, v5
+; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v9
+; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
+; GFX10GISEL-NEXT:    v_and_or_b32 v4, 0xffff, v10, v0
+; GFX10GISEL-NEXT:    v_and_or_b32 v5, 0xffff, v11, v1
 ; GFX10GISEL-NEXT:    image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
index 75990337c8a38..ffab03057c2bc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
@@ -15,9 +15,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
 ; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
@@ -33,10 +32,9 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff ; encoding: [0xff,0x02,0x04,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v9, v2, v9 ; encoding: [0x02,0x13,0x12,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0 ; encoding: [0x02,0x01,0x00,0x36]
-; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v9 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x25,0x04]
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v9 ; encoding: [0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x09,0x04]
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
 ; GFX10-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x11,0x0f,0x88,0xf0,0x02,0x00,0x40,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
@@ -60,9 +58,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_c_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
 ; GFX10-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06]
@@ -87,9 +84,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_d_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
 ; GFX10-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
@@ -116,11 +112,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v3, v0, v3 ; encoding: [0x00,0x07,0x06,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x36]
-; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v3 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
-; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x01,0x04]
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04]
+; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04]
 ; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf0,0x02,0x00,0x40,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -143,9 +138,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
 ; GFX10-NEXT:    image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
@@ -170,9 +164,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_c_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
 ; GFX10-NEXT:    image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06]
@@ -197,9 +190,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_cd_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
 ; GFX10-NEXT:    image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
@@ -226,11 +218,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v3, v0, v3 ; encoding: [0x00,0x07,0x06,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x36]
-; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v3 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
-; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x01,0x04]
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04]
+; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04]
 ; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf1,0x02,0x00,0x40,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -243,14 +234,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-LABEL: sample_c_d_o_2darray_V1:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4 ; encoding: [0x00,0x09,0x02,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v9 ; encoding: [0x00,0x13,0x00,0x36]
-; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v1 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x05,0x04]
-; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x01,0x04]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x01,0x04]
+; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x05,0x04]
 ; GFX10-NEXT:    image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x04,0xe8,0xf0,0x02,0x00,0x40,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -263,14 +253,13 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX10-LABEL: sample_c_d_o_2darray_V2:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4 ; encoding: [0x00,0x09,0x02,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v9 ; encoding: [0x00,0x13,0x00,0x36]
-; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v1 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x05,0x04]
-; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x01,0x04]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x01,0x04]
+; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x05,0x04]
 ; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x06,0xe8,0xf0,0x02,0x00,0x40,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
index e0a18eb9ae66f..ec12dffc89986 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
@@ -15,9 +15,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v2, v6, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v6, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -33,10 +32,9 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v9, v2, v9
-; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v9
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v9
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
 ; GFX10-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -60,9 +58,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_c_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v7, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v7, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -87,9 +84,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_d_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -116,11 +112,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v0, v3
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v1
 ; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -143,9 +138,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v2, v6, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v6, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-NEXT:    image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -170,9 +164,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_c_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v7, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v7, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10-NEXT:    image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -197,9 +190,8 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_cd_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-NEXT:    image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -226,11 +218,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v0, v3
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v1
 ; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -243,14 +234,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-LABEL: sample_c_d_o_2darray_V1:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v9
-; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v1
-; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v9
+; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v1
 ; GFX10-NEXT:    image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -263,14 +253,13 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX10-LABEL: sample_c_d_o_2darray_V2:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v9
-; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v1
-; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v9
+; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v1
 ; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll
index 08551a22e6aa1..9540524b9d2ba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll
@@ -31,10 +31,9 @@ main_body:
 ; GCN-LABEL: {{^}}buffer_store_format_d16_xyz:
 ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
 
-; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
+; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
 
 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]]
@@ -56,11 +55,10 @@ main_body:
 ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
 ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
 
-; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
 
 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
index fec7954a02255..bd30de55ae0c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
@@ -35,10 +35,9 @@ main_body:
 ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz:
 ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}},
 
-; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
+; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
 
 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]]
@@ -60,11 +59,10 @@ main_body:
 ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
 ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}},
 
-; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
 
 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll
index 30361a2b36ed3..f5b36f9a25c36 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll
@@ -31,10 +31,9 @@ main_body:
 ; GCN-LABEL: {{^}}buffer_store_format_d16_xyz:
 ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
 
-; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
+; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
 
 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]]
@@ -56,11 +55,10 @@ main_body:
 ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
 ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
 
-; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
 
 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
index 5d05c955ddd0f..8a1819d9a292c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
@@ -35,10 +35,9 @@ main_body:
 ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz:
 ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
 
-; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
+; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
 
 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]]
@@ -59,11 +58,10 @@ main_body:
 ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
 ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
 
-; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
 
 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
index 9da9ca2cc99e9..bcd7c4a71b294 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
@@ -31,10 +31,9 @@ main_body:
 ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz:
 ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
 
-; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
-; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
+; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
 
 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
@@ -53,11 +52,10 @@ main_body:
 ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
 ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
 
-; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
 ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
 
 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
index 0b47c934ee792..e3572de13faef 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
@@ -33,8 +33,8 @@ entry:
 ; VI:     flat_load_dword v[[A_F16_0:[0-9]+]]
 ; GFX9:   global_load_dword v[[A_F16_0:[0-9]+]]
 ; SI:     s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3f317218
-; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c
-; VI:     v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]]
+; GFX9:   s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c
+; VI:     v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x398c
 ; SI:     v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
 ; SI:     v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]]
 ; SI:     v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
@@ -49,7 +49,8 @@ entry:
 ; VI:     v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
 ; VI:     v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9:   v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]]
-; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
+; VI:     v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x398c, v[[R_F16_0]]
+; GFX9:   v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
 ; SI:     v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
 ; SI-NOT: v_and_b32_e32
 ; SI:     v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
index 2851b270d93b9..d5cb21cfd0a76 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
@@ -33,8 +33,8 @@ entry:
 ; VI:     flat_load_dword v[[A_F16_0:[0-9]+]]
 ; GFX9:   global_load_dword v[[A_F16_0:[0-9]+]]
 ; SI:     s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3e9a209a
-; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1
-; VI:     v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]]
+; GFX9:   s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1
+; VI:     v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x34d1
 ; SI:     v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
 ; SI:     v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]]
 ; SI:     v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
@@ -48,8 +48,9 @@ entry:
 ; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI:     v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
 ; VI:     v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI:     v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x34d1, v[[R_F16_0]]
 ; GFX9:   v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]]
-; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
+; GFX9:   v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
 ; SI:     v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
 ; SI-NOT: v_and_b32_e32
 ; SI:     v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index f73ed62f20f82..995379cdf5b13 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -147,21 +147,19 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_movk_i32 s7, 0xfc01
 ; SI-NEXT:    s_mov_b32 s3, 0xfffff
+; SI-NEXT:    s_mov_b32 s2, s6
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
-; SI-NEXT:    s_add_i32 s14, s0, s7
-; SI-NEXT:    s_mov_b32 s2, s6
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s14
-; SI-NEXT:    s_brev_b32 s15, 1
+; SI-NEXT:    s_add_i32 s7, s0, 0xfffffc01
+; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s7
 ; SI-NEXT:    s_andn2_b64 s[12:13], s[10:11], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s11, s15
-; SI-NEXT:    s_cmp_lt_i32 s14, 0
+; SI-NEXT:    s_and_b32 s0, s11, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s7, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s13
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s14, 51
+; SI-NEXT:    s_cmp_gt_i32 s7, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s11
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -172,23 +170,23 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; SI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
 ; SI-NEXT:    s_bfe_u32 s0, s9, 0xb0014
-; SI-NEXT:    s_add_i32 s7, s0, s7
-; SI-NEXT:    s_brev_b32 s10, -2
+; SI-NEXT:    s_add_i32 s10, s0, 0xfffffc01
+; SI-NEXT:    s_brev_b32 s7, -2
 ; SI-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
 ; SI-NEXT:    v_mov_b32_e32 v4, s11
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s7
-; SI-NEXT:    v_bfi_b32 v4, s10, v6, v4
+; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s10
+; SI-NEXT:    v_bfi_b32 v4, s7, v6, v4
 ; SI-NEXT:    s_andn2_b64 s[2:3], s[8:9], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s9, s15
+; SI-NEXT:    s_and_b32 s0, s9, 0x80000000
 ; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_cmp_lt_i32 s7, 0
+; SI-NEXT:    s_cmp_lt_i32 s10, 0
 ; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s7, 51
+; SI-NEXT:    s_cmp_gt_i32 s10, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -200,7 +198,7 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
 ; SI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v7, s9
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; SI-NEXT:    v_bfi_b32 v6, s10, v6, v7
+; SI-NEXT:    v_bfi_b32 v6, s7, v6, v7
 ; SI-NEXT:    v_cndmask_b32_e32 v5, 0, v6, vcc
 ; SI-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
@@ -245,22 +243,20 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
 ; SI-NEXT:    s_mov_b32 s14, -1
-; SI-NEXT:    s_movk_i32 s18, 0xfc01
 ; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xfffff
+; SI-NEXT:    s_mov_b32 s2, s14
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_bfe_u32 s0, s7, 0xb0014
-; SI-NEXT:    s_add_i32 s19, s0, s18
-; SI-NEXT:    s_mov_b32 s2, s14
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s19
-; SI-NEXT:    s_brev_b32 s20, 1
+; SI-NEXT:    s_add_i32 s18, s0, 0xfffffc01
+; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s18
 ; SI-NEXT:    s_andn2_b64 s[16:17], s[6:7], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s7, s20
-; SI-NEXT:    s_cmp_lt_i32 s19, 0
+; SI-NEXT:    s_and_b32 s0, s7, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s18, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s17
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s19, 51
+; SI-NEXT:    s_cmp_gt_i32 s18, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -271,7 +267,7 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
 ; SI-NEXT:    s_bfe_u32 s0, s5, 0xb0014
-; SI-NEXT:    s_add_i32 s17, s0, s18
+; SI-NEXT:    s_add_i32 s17, s0, 0xfffffc01
 ; SI-NEXT:    s_brev_b32 s16, -2
 ; SI-NEXT:    v_mov_b32_e32 v12, 0x3ff00000
 ; SI-NEXT:    v_mov_b32_e32 v4, s7
@@ -279,7 +275,7 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 ; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s17
 ; SI-NEXT:    v_bfi_b32 v4, s16, v12, v4
 ; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s5, s20
+; SI-NEXT:    s_and_b32 s0, s5, 0x80000000
 ; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_cmp_lt_i32 s17, 0
@@ -298,12 +294,12 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
 ; SI-NEXT:    v_add_f64 v[4:5], s[4:5], -v[0:1]
-; SI-NEXT:    s_add_i32 s6, s0, s18
+; SI-NEXT:    s_add_i32 s6, s0, 0xfffffc01
 ; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s6
 ; SI-NEXT:    v_mov_b32_e32 v6, s5
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
 ; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s11, s20
+; SI-NEXT:    s_and_b32 s0, s11, 0x80000000
 ; SI-NEXT:    v_bfi_b32 v6, s16, v12, v6
 ; SI-NEXT:    s_cmp_lt_i32 s6, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v6, vcc
@@ -321,13 +317,13 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
 ; SI-NEXT:    v_add_f64 v[6:7], s[10:11], -v[4:5]
 ; SI-NEXT:    s_bfe_u32 s0, s9, 0xb0014
-; SI-NEXT:    s_add_i32 s4, s0, s18
+; SI-NEXT:    s_add_i32 s4, s0, 0xfffffc01
 ; SI-NEXT:    v_mov_b32_e32 v10, s11
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
 ; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s4
 ; SI-NEXT:    v_bfi_b32 v10, s16, v12, v10
 ; SI-NEXT:    s_andn2_b64 s[2:3], s[8:9], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s9, s20
+; SI-NEXT:    s_and_b32 s0, s9, 0x80000000
 ; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v10, vcc
 ; SI-NEXT:    v_mov_b32_e32 v6, 0
 ; SI-NEXT:    s_cmp_lt_i32 s4, 0
@@ -412,21 +408,20 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
 ; SI-NEXT:    s_mov_b32 s22, -1
-; SI-NEXT:    s_movk_i32 s28, 0xfc01
 ; SI-NEXT:    s_mov_b32 s21, 0xfffff
 ; SI-NEXT:    s_mov_b32 s20, s22
+; SI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_bfe_u32 s2, s7, 0xb0014
-; SI-NEXT:    s_add_i32 s23, s2, s28
-; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s23
-; SI-NEXT:    s_brev_b32 s29, 1
+; SI-NEXT:    s_add_i32 s26, s2, 0xfffffc01
+; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s26
+; SI-NEXT:    s_and_b32 s23, s7, 0x80000000
 ; SI-NEXT:    s_andn2_b64 s[24:25], s[6:7], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s7, s29
-; SI-NEXT:    s_cmp_lt_i32 s23, 0
+; SI-NEXT:    s_cmp_lt_i32 s26, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s25
-; SI-NEXT:    v_mov_b32_e32 v1, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s23
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s23, 51
+; SI-NEXT:    s_cmp_gt_i32 s26, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -437,15 +432,14 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
 ; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
 ; SI-NEXT:    s_bfe_u32 s2, s5, 0xb0014
-; SI-NEXT:    s_add_i32 s24, s2, s28
+; SI-NEXT:    s_add_i32 s24, s2, 0xfffffc01
 ; SI-NEXT:    s_brev_b32 s23, -2
-; SI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
 ; SI-NEXT:    v_mov_b32_e32 v4, s7
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
 ; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s24
 ; SI-NEXT:    v_bfi_b32 v4, s23, v8, v4
 ; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s5, s29
+; SI-NEXT:    s_and_b32 s2, s5, 0x80000000
 ; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_cmp_lt_i32 s24, 0
@@ -464,13 +458,13 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[2:3]
 ; SI-NEXT:    v_add_f64 v[4:5], s[4:5], -v[0:1]
 ; SI-NEXT:    s_bfe_u32 s2, s11, 0xb0014
-; SI-NEXT:    s_add_i32 s6, s2, s28
+; SI-NEXT:    s_add_i32 s6, s2, 0xfffffc01
 ; SI-NEXT:    v_mov_b32_e32 v6, s5
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
 ; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s6
 ; SI-NEXT:    v_bfi_b32 v6, s23, v8, v6
 ; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s11, s29
+; SI-NEXT:    s_and_b32 s2, s11, 0x80000000
 ; SI-NEXT:    v_cndmask_b32_e32 v5, 0, v6, vcc
 ; SI-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-NEXT:    s_cmp_lt_i32 s6, 0
@@ -489,13 +483,13 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[2:3]
 ; SI-NEXT:    v_add_f64 v[6:7], s[10:11], -v[4:5]
 ; SI-NEXT:    s_bfe_u32 s2, s9, 0xb0014
-; SI-NEXT:    s_add_i32 s6, s2, s28
+; SI-NEXT:    s_add_i32 s6, s2, 0xfffffc01
 ; SI-NEXT:    v_mov_b32_e32 v9, s11
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
 ; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s6
 ; SI-NEXT:    v_bfi_b32 v9, s23, v8, v9
 ; SI-NEXT:    s_andn2_b64 s[4:5], s[8:9], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s9, s29
+; SI-NEXT:    s_and_b32 s2, s9, 0x80000000
 ; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
 ; SI-NEXT:    v_mov_b32_e32 v6, 0
 ; SI-NEXT:    s_cmp_lt_i32 s6, 0
@@ -514,12 +508,12 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[2:3]
 ; SI-NEXT:    s_bfe_u32 s2, s15, 0xb0014
 ; SI-NEXT:    v_add_f64 v[9:10], s[8:9], -v[4:5]
-; SI-NEXT:    s_add_i32 s4, s2, s28
+; SI-NEXT:    s_add_i32 s4, s2, 0xfffffc01
 ; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s4
 ; SI-NEXT:    v_mov_b32_e32 v11, s9
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[9:10]|, 0.5
 ; SI-NEXT:    s_andn2_b64 s[24:25], s[14:15], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s15, s29
+; SI-NEXT:    s_and_b32 s2, s15, 0x80000000
 ; SI-NEXT:    v_bfi_b32 v11, s23, v8, v11
 ; SI-NEXT:    s_cmp_lt_i32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v10, 0, v11, vcc
@@ -530,10 +524,10 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_mov_b32_e32 v10, s2
 ; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; SI-NEXT:    s_bfe_u32 s4, s13, 0xb0014
-; SI-NEXT:    s_add_i32 s6, s4, s28
+; SI-NEXT:    s_add_i32 s6, s4, 0xfffffc01
 ; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], s6
 ; SI-NEXT:    s_andn2_b64 s[26:27], s[12:13], s[4:5]
-; SI-NEXT:    s_and_b32 s4, s13, s29
+; SI-NEXT:    s_and_b32 s4, s13, 0x80000000
 ; SI-NEXT:    v_mov_b32_e32 v9, s25
 ; SI-NEXT:    s_cmp_lt_i32 s6, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v15, v9, v10, vcc
@@ -542,30 +536,30 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    s_cmp_gt_i32 s6, 51
 ; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; SI-NEXT:    s_bfe_u32 s8, s19, 0xb0014
-; SI-NEXT:    s_add_i32 s25, s8, s28
-; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], s25
-; SI-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[8:9]
-; SI-NEXT:    s_and_b32 s8, s19, s29
+; SI-NEXT:    s_add_i32 s10, s8, 0xfffffc01
+; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], s10
+; SI-NEXT:    s_andn2_b64 s[28:29], s[18:19], s[8:9]
+; SI-NEXT:    s_and_b32 s8, s19, 0x80000000
 ; SI-NEXT:    v_mov_b32_e32 v9, s27
-; SI-NEXT:    s_cmp_lt_i32 s25, 0
+; SI-NEXT:    s_cmp_lt_i32 s10, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v17, v9, v10, s[4:5]
-; SI-NEXT:    v_mov_b32_e32 v9, s11
+; SI-NEXT:    v_mov_b32_e32 v9, s29
 ; SI-NEXT:    v_mov_b32_e32 v10, s8
 ; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s25, 51
+; SI-NEXT:    s_cmp_gt_i32 s10, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v10, s19
-; SI-NEXT:    v_mov_b32_e32 v11, s10
 ; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v10, v9, v10, s[10:11]
-; SI-NEXT:    v_cndmask_b32_e64 v9, v11, 0, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v9, s28
+; SI-NEXT:    v_cndmask_b32_e64 v9, v9, 0, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v11, s18
 ; SI-NEXT:    s_bfe_u32 s8, s17, 0xb0014
 ; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[10:11]
-; SI-NEXT:    s_add_i32 s10, s8, s28
+; SI-NEXT:    s_add_i32 s10, s8, 0xfffffc01
 ; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], s10
 ; SI-NEXT:    s_andn2_b64 s[20:21], s[16:17], s[8:9]
-; SI-NEXT:    s_and_b32 s8, s17, s29
+; SI-NEXT:    s_and_b32 s8, s17, 0x80000000
 ; SI-NEXT:    s_cmp_lt_i32 s10, 0
 ; SI-NEXT:    v_mov_b32_e32 v11, s21
 ; SI-NEXT:    v_mov_b32_e32 v12, s8

diff  --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 23150da4e53bc..f0fece24c7031 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -1027,31 +1027,29 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s7, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, s6
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, s6
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s6, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s6
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s4, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s0, s2, 16
-; GCN-HSA-NEXT:    s_and_b32 s1, s3, s4
-; GCN-HSA-NEXT:    s_and_b32 s2, s2, s4
+; GCN-HSA-NEXT:    s_and_b32 s1, s3, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s1
@@ -1061,21 +1059,18 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(
 ; GCN-NOHSA-VI-LABEL: constant_zextload_v3i16_to_v3i32:
 ; GCN-NOHSA-VI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s0, s3, s8
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s1, s2, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s2, s2, s8
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s4, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_zextload_v3i16_to_v3i32:
@@ -1202,12 +1197,11 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s6, s5, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s7, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, s2
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
@@ -1219,7 +1213,6 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(
 ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s4, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1227,8 +1220,8 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s0, s3, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s1, s2, 16
-; GCN-HSA-NEXT:    s_and_b32 s3, s3, s4
-; GCN-HSA-NEXT:    s_and_b32 s2, s2, s4
+; GCN-HSA-NEXT:    s_and_b32 s3, s3, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s3
@@ -1239,23 +1232,20 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(
 ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s0, s3, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s1, s3, s8
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s3, s2, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s2, s2, s8
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s0
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s5, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s7, s4, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s6
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_zextload_v4i16_to_v4i32:
@@ -1396,26 +1386,25 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s9, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s10, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s11, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, s8
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, s8
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, s8
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, s8
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s8, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s9, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s10, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s11, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s10
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -1424,22 +1413,21 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s8, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s9, s4, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s3, s7, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s10, s6, 16
-; GCN-HSA-NEXT:    s_and_b32 s5, s5, s2
-; GCN-HSA-NEXT:    s_and_b32 s4, s4, s2
-; GCN-HSA-NEXT:    s_and_b32 s7, s7, s2
-; GCN-HSA-NEXT:    s_and_b32 s2, s6, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    s_lshr_b32 s2, s7, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s3, s6, 16
+; GCN-HSA-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -1455,33 +1443,30 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(
 ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s0, s9, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s1, s9, s2
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s3, s8, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s8, s8, s2
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s9, s11, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s11, s11, s2
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s12, s10, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s2, s10, s2
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s12
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s9
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s7, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s11, s6, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s5, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s9, s4, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s10
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s0
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s8
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_zextload_v8i16_to_v8i32:
@@ -1662,46 +1647,45 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspa
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s13, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s15, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s17, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, s12
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, s12
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, s12
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, s12
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, s12
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, s12
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, s12
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, s12
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s13, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s15, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s17, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s18
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s17
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s15
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s14
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s13
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s12
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -1710,7 +1694,6 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspa
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s12, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s13, s4, 16
@@ -1718,24 +1701,24 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspa
 ; GCN-HSA-NEXT:    s_lshr_b32 s15, s6, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s16, s9, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s17, s8, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s3, s11, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s18, s10, 16
-; GCN-HSA-NEXT:    s_and_b32 s5, s5, s2
-; GCN-HSA-NEXT:    s_and_b32 s4, s4, s2
-; GCN-HSA-NEXT:    s_and_b32 s7, s7, s2
-; GCN-HSA-NEXT:    s_and_b32 s6, s6, s2
-; GCN-HSA-NEXT:    s_and_b32 s9, s9, s2
-; GCN-HSA-NEXT:    s_and_b32 s8, s8, s2
-; GCN-HSA-NEXT:    s_and_b32 s11, s11, s2
-; GCN-HSA-NEXT:    s_and_b32 s2, s10, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    s_lshr_b32 s2, s11, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s3, s10, 16
+; GCN-HSA-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s8, s8, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s10, s10, 0xffff
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s18
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -1766,46 +1749,43 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspa
 ;
 ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx8 s[4:11], s[14:15], 0x0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s14, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s12
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s13
-; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s19, s11, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s11, s11, s14
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s20, s10, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s10, s10, s14
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s17, s9, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s9, s9, s14
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s18, s8, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s8, s8, s14
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s18, s11, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s19, s10, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s10, s10, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s16, s9, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s17, s8, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s20
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s19
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s19
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s15, s7, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, s14
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s16, s6, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, s14
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s18
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s14, s7, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s15, s6, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s12, s5, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s18
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s17
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s17
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, s14
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s13, s4, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, s14
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s15
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s14
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
@@ -2097,282 +2077,277 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspa
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s1, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s0, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s3, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s27, s1, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s28, s0, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s29, s3, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s30, s2, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s31, s9, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s33, s8, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s34, s11, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s35, s10, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s36, s13, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s37, s12, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s38, s15, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s18, s14, s18
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s9, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s8, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s11, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s10, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s13, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s15, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s1, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s3, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s35, s1, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s36, s0, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s37, s3, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s38, s2, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s38
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s37
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s30
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s35
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s34
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s28
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s33
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s27
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s26
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s25
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s24
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s22
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s30
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s29
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s20
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s28
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s27
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s35
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s18
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
-; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s18, 0xffff
+; GCN-HSA-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_and_b32 s19, s1, s18
-; GCN-HSA-NEXT:    s_and_b32 s20, s0, s18
-; GCN-HSA-NEXT:    s_and_b32 s21, s3, s18
-; GCN-HSA-NEXT:    s_and_b32 s22, s2, s18
-; GCN-HSA-NEXT:    s_and_b32 s23, s5, s18
-; GCN-HSA-NEXT:    s_and_b32 s24, s4, s18
-; GCN-HSA-NEXT:    s_and_b32 s25, s7, s18
-; GCN-HSA-NEXT:    s_and_b32 s26, s6, s18
-; GCN-HSA-NEXT:    s_and_b32 s27, s9, s18
-; GCN-HSA-NEXT:    s_and_b32 s28, s8, s18
-; GCN-HSA-NEXT:    s_and_b32 s29, s11, s18
-; GCN-HSA-NEXT:    s_and_b32 s30, s10, s18
-; GCN-HSA-NEXT:    s_and_b32 s31, s13, s18
-; GCN-HSA-NEXT:    s_and_b32 s33, s12, s18
-; GCN-HSA-NEXT:    s_and_b32 s34, s15, s18
-; GCN-HSA-NEXT:    s_and_b32 s18, s14, s18
-; GCN-HSA-NEXT:    s_lshr_b32 s35, s1, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s36, s0, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s3, s3, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s2, s2, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s5, s5, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s4, s4, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s7, s7, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s6, s6, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s9, s9, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s8, s8, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s11, s11, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s10, s10, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s13, s13, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s12, s12, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s15, s15, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s14, s14, 16
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
+; GCN-HSA-NEXT:    s_lshr_b32 s20, s5, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s21, s4, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s22, s7, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s23, s6, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s24, s9, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s25, s8, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s26, s11, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s27, s10, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s28, s13, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s29, s12, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s30, s15, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s31, s14, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s33, s17, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s34, s16, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s35, s19, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s36, s18, 16
+; GCN-HSA-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s8, s8, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s10, s10, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s13, s13, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s12, s12, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s15, s15, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s14, s14, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s17, s17, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s16, s16, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s19, s19, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s18, s18, 0xffff
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s34
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s33
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s12
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s31
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s36
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s35
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s34
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s17
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s33
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s30
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s29
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s11
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s31
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s30
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s28
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s27
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s9
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s29
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s28
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s26
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s25
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s7
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s27
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s26
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s24
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s23
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s24
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s22
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s22
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s21
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s20
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s36
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s19
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s17
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[20:23], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s19, 0xf000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s18, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[22:23], 0x0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s22, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s16, s20
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s17, s21
-; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s36, s15, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s15, s15, s22
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s37, s14, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s14, s14, s22
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s34, s13, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s13, s13, s22
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s35, s12, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s12, s12, s22
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s35, s15, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s15, s15, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s36, s14, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s14, s14, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s33, s13, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s13, s13, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s34, s12, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s12, s12, 0xffff
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s36
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s36
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s31, s11, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s11, s11, s22
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s33, s10, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s10, s10, s22
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s35
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s30, s11, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s31, s10, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s29, s9, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s28, s9, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s35
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s13
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s34
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s9, s9, s22
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s30, s8, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s8, s8, s22
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s29, s8, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s27, s7, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s26, s7, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s31
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s31
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, s22
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s28, s6, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, s22
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s30
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s27, s6, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s25, s5, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s24, s5, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s30
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s29
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, s22
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s26, s4, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, s22
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s28
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s25, s4, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s23, s3, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s22, s3, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s28
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s27
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s27
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s3, s3, s22
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s24, s2, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s2, s2, s22
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s26
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s3, s3, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s23, s2, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s20, s1, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s26
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s25
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s25
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s1, s1, s22
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s24
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s21, s0, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s0, s0, s22
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s24
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s22
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2884,101 +2859,100 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s20, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[36:51], s[18:19], 0x10
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s1, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s0, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s3, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s35, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s52, s1, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s53, s0, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s54, s3, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s55, s2, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s56, s37, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s57, s36, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s58, s39, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s59, s38, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s60, s41, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s61, s40, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s62, s43, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s63, s42, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s64, s45, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s65, s44, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s66, s47, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s67, s46, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s68, s49, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s69, s48, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s70, s51, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s20, s50, s20
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s37, s37, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s36, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s39, s39, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s38, s38, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s41, s41, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s40, s40, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s42, s42, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s45, s45, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s44, s44, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s47, s47, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s46, s46, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s49, s49, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s48, s48, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s51, s51, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s50, s50, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s43, s43, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s3, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s35, s1, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s52, s0, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s53, s3, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s54, s2, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s55, s37, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s56, s36, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s57, s39, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s58, s38, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s59, s41, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s60, s40, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s61, s43, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s62, s42, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s63, s45, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s64, s44, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s65, s47, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s66, s46, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s67, s49, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s68, s48, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s69, s51, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s70, s50, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s37, s37, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s36, s36, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s39, s39, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s38, s38, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s40, s40, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s43, s43, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s42, s42, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s45, s45, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s44, s44, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s47, s47, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s46, s46, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s49, s49, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s48, s48, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s51, s51, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s50, s50, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s41, s41, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s50
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s70
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s51
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s69
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s48
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s68
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s49
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s67
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s46
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s66
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s47
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s65
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s44
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s64
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s45
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s63
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s42
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s62
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s61
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s43
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s40
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s60
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s41
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s50
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s70
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s51
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s69
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s48
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s68
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s49
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s67
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s46
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s66
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s47
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s65
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s44
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s64
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s45
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s63
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s42
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s62
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s43
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s40
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s61
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s60
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s41
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s59
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
@@ -2986,63 +2960,63 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(5)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s59
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s38
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s58
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s58
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s57
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s57
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s56
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s56
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s55
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s35
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s31
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s30
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s30
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s28
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s28
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s27
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s27
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s26
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s25
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s24
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s22
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s55
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s54
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s53
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s20
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s53
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s52
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s52
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s35
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s18
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -3050,7 +3024,6 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s53, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[36:51], s[18:19], 0x10
@@ -3071,54 +3044,54 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_lshr_b32 s34, s12, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s35, s15, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s52, s14, 16
-; GCN-HSA-NEXT:    s_and_b32 s1, s1, s53
-; GCN-HSA-NEXT:    s_and_b32 s0, s0, s53
-; GCN-HSA-NEXT:    s_and_b32 s3, s3, s53
-; GCN-HSA-NEXT:    s_and_b32 s2, s2, s53
-; GCN-HSA-NEXT:    s_and_b32 s5, s5, s53
-; GCN-HSA-NEXT:    s_and_b32 s4, s4, s53
-; GCN-HSA-NEXT:    s_and_b32 s54, s7, s53
-; GCN-HSA-NEXT:    s_and_b32 s55, s6, s53
-; GCN-HSA-NEXT:    s_and_b32 s9, s9, s53
-; GCN-HSA-NEXT:    s_and_b32 s8, s8, s53
-; GCN-HSA-NEXT:    s_and_b32 s11, s11, s53
-; GCN-HSA-NEXT:    s_and_b32 s10, s10, s53
-; GCN-HSA-NEXT:    s_and_b32 s13, s13, s53
-; GCN-HSA-NEXT:    s_and_b32 s12, s12, s53
-; GCN-HSA-NEXT:    s_and_b32 s15, s15, s53
-; GCN-HSA-NEXT:    s_and_b32 s14, s14, s53
-; GCN-HSA-NEXT:    s_and_b32 s18, s37, s53
-; GCN-HSA-NEXT:    s_and_b32 s19, s36, s53
-; GCN-HSA-NEXT:    s_and_b32 s56, s39, s53
-; GCN-HSA-NEXT:    s_and_b32 s57, s38, s53
-; GCN-HSA-NEXT:    s_and_b32 s58, s41, s53
-; GCN-HSA-NEXT:    s_and_b32 s59, s40, s53
-; GCN-HSA-NEXT:    s_and_b32 s60, s43, s53
-; GCN-HSA-NEXT:    s_and_b32 s61, s42, s53
-; GCN-HSA-NEXT:    s_and_b32 s62, s45, s53
-; GCN-HSA-NEXT:    s_and_b32 s63, s44, s53
-; GCN-HSA-NEXT:    s_and_b32 s64, s47, s53
-; GCN-HSA-NEXT:    s_and_b32 s65, s46, s53
-; GCN-HSA-NEXT:    s_and_b32 s66, s49, s53
-; GCN-HSA-NEXT:    s_and_b32 s67, s48, s53
-; GCN-HSA-NEXT:    s_and_b32 s68, s51, s53
-; GCN-HSA-NEXT:    s_and_b32 s53, s50, s53
-; GCN-HSA-NEXT:    s_lshr_b32 s37, s37, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s36, s36, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s39, s39, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s38, s38, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s41, s41, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s40, s40, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s43, s43, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s42, s42, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s45, s45, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s44, s44, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s47, s47, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s46, s46, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s49, s49, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s48, s48, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s51, s51, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s50, s50, 16
+; GCN-HSA-NEXT:    s_and_b32 s1, s1, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s0, s0, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s3, s3, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s2, s2, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s53, s7, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s54, s6, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s8, s8, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s10, s10, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s13, s13, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s12, s12, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s15, s15, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s14, s14, 0xffff
+; GCN-HSA-NEXT:    s_lshr_b32 s18, s37, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s19, s36, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s55, s39, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s56, s38, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s57, s41, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s58, s40, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s59, s43, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s60, s42, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s61, s45, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s62, s44, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s63, s47, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s64, s46, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s65, s49, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s66, s48, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s67, s51, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s68, s50, 16
+; GCN-HSA-NEXT:    s_and_b32 s37, s37, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s36, s36, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s39, s39, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s38, s38, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s41, s41, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s40, s40, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s43, s43, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s42, s42, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s45, s45, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s44, s44, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s47, s47, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s46, s46, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s49, s49, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s48, s48, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s51, s51, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s50, s50, 0xffff
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xf0
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s7
@@ -3144,10 +3117,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s6
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x90
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s67
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s66
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s49
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s66
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s49
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s65
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s7
@@ -3157,47 +3130,47 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s6
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x70
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s61
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s42
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s60
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s43
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s42
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s60
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s43
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s59
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s53
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s50
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s6
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x60
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s68
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s68
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s67
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s65
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s46
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s47
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s63
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s44
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s62
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s45
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s59
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s46
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s47
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s63
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s62
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s45
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s61
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s40
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s40
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s57
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s58
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s41
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s38
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s56
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s19
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s39
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s36
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s58
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s41
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s57
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s56
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s39
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s36
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s55
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s19
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s6
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x50
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s18
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s37
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s14
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s37
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s18
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s52
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s15
@@ -3230,9 +3203,9 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s55
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s54
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s27
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s54
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s53
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s26
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -3268,160 +3241,147 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[36:39], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s40, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x40
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s33, s17, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s17, s17, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s34, s16, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s16, s16, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s35, s19, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s19, s19, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s38, s18, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s18, s18, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s39, s21, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s21, s21, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s41, s20, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s20, s20, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s42, s23, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s23, s23, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s43, s22, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s22, s22, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s44, s25, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s25, s25, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s45, s24, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s24, s24, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s46, s27, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s27, s27, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s47, s26, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s26, s26, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s48, s29, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s29, s29, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s49, s28, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s28, s28, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s50, s31, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s31, s31, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s51, s30, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s30, s30, s40
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s53, s1, s40
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s55, s0, s40
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s57, s3, s40
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s59, s2, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s60, s5, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s61, s4, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s62, s7, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s63, s6, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, s40
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s64, s9, s40
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s65, s8, s40
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s66, s11, s40
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s67, s10, s40
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s68, s13, s40
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s69, s12, s40
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s70, s15, s40
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s40, s14, s40
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s15, s15, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s14, s14, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s52, s1, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s54, s0, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s56, s3, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s58, s2, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s13, s13, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s12, s12, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s49, s31, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s69, s15, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s15, s15, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s70, s14, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s14, s14, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s51, s1, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s52, s1, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s53, s0, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s54, s0, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s55, s3, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s56, s3, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s57, s2, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s58, s2, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s67, s13, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s13, s13, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s68, s12, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s12, s12, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s36
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s37
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s40
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s14
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s70
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s11, s11, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s70
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s15
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s69
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s65, s11, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s66, s10, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s9, s9, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s69
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s12
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s68
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s13
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s63, s9, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s68
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s13
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s67
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s64, s8, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s67
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s66
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s61, s7, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s66
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s65
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s62, s6, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s65
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s8
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s64
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s59, s5, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s64
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s63
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s60, s4, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s31, s31, 0xffff
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s63
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s62
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s62
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s61
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s50, s30, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s61
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s60
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s60
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s59
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s59
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s58
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s56
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s30, s30, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s58
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s57
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s55
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s55
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s54
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s53
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s52
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s47, s29, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s54
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s53
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s52
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s51
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s29, s29, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s48, s28, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s28, s28, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s45, s27, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s30
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s51
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s50
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s31
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s50
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s49
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s27, s27, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s46, s26, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s26, s26, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s43, s25, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s28
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s49
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s48
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s29
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s48
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s47
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s25, s25, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s44, s24, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s24, s24, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s41, s23, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s26
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s47
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s46
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s27
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s46
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s45
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s23, s23, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s42, s22, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s22, s22, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s39, s21, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s24
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s45
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s44
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s25
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s44
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s43
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s21, s21, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s40, s20, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s20, s20, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s35, s19, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s43
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s42
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s23
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s42
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s41
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s19, s19, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s38, s18, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s18, s18, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s33, s17, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s41
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s40
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s21
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s39
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s17, s17, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s34, s16, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s16, s16, 0xffff
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18
@@ -4847,52 +4807,48 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s0, s3, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s1, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s3, s3, s8
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s2, s2, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s6, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s7, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s4, 0xffff
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_lshr_b32 s5, s3, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s6, s2, 16
-; GCN-HSA-NEXT:    s_and_b32 s7, s2, s4
-; GCN-HSA-NEXT:    s_and_b32 s2, s3, s4
+; GCN-HSA-NEXT:    s_lshr_b32 s4, s3, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s5, s2, 16
+; GCN-HSA-NEXT:    s_and_b32 s6, s2, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
@@ -4900,19 +4856,18 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(
 ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i64:
 ; GCN-NOHSA-VI:       ; %bb.0:
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, 0xffff
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s0, s2, s8
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s0, s2, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s1, s2, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s2, s3, s8
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s3, s3, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s3
@@ -5090,39 +5045,36 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(
 define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s4, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s5, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s7, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, s6
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, s6
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, s6
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s9, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s8, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s9, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s10, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s11, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s10
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -5133,19 +5085,18 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s8, s5, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s3, s7, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s2, s7, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s9, s6, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s10, s4, 16
-; GCN-HSA-NEXT:    s_and_b32 s4, s4, s2
-; GCN-HSA-NEXT:    s_and_b32 s6, s6, s2
-; GCN-HSA-NEXT:    s_and_b32 s5, s5, s2
-; GCN-HSA-NEXT:    s_and_b32 s2, s7, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s3, s7, 0xffff
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
@@ -5173,39 +5124,35 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(
 ;
 ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i64:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s8, s6
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s5, s8, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s9, s6
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s9, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s9, s10, s6
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s11, s6
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s11, s11, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s11, s7, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s7, s7, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s10, s6, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s9, s5, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s5, s5, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s9
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s8, s4, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
@@ -5464,39 +5411,36 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(
 define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx8 s[4:11], s[14:15], 0x0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s14, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s12
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s13
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s5, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s13, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s15, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s17, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, s14
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, s14
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, s14
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, s14
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, s14
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, s14
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, s14
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, s14
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s15, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s17, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s14
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
@@ -5508,19 +5452,19 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s17
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s18
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -5531,27 +5475,26 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s2, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s12, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s13, s7, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s14, s11, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s3, s9, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s2, s9, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s15, s8, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s16, s10, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s17, s6, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s18, s4, 16
-; GCN-HSA-NEXT:    s_and_b32 s4, s4, s2
-; GCN-HSA-NEXT:    s_and_b32 s6, s6, s2
-; GCN-HSA-NEXT:    s_and_b32 s10, s10, s2
-; GCN-HSA-NEXT:    s_and_b32 s8, s8, s2
-; GCN-HSA-NEXT:    s_and_b32 s5, s5, s2
-; GCN-HSA-NEXT:    s_and_b32 s7, s7, s2
-; GCN-HSA-NEXT:    s_and_b32 s11, s11, s2
-; GCN-HSA-NEXT:    s_and_b32 s2, s9, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s10, s10, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s8, s8, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s3, s9, 0xffff
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
@@ -5607,50 +5550,47 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspa
 ;
 ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i64:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx8 s[4:11], s[14:15], 0x0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s14, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s12
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s13
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s12, s4, s14
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s13, s5, s14
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s15, s6, s14
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s16, s7, s14
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s17, s8, s14
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s18, s9, s14
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s19, s10, s14
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s14, s11, s14
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s19, s11, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s11, s11, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s18, s10, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s19
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s17, s9, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s9, s9, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s19
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s16, s8, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s7, s7, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s15, s7, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s17
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s7, s7, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s17
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s14, s6, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s5, s5, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s13, s5, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s15
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s5, s5, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s15
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s12, s4, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s13
@@ -6109,353 +6049,346 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa
 define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s1, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s3, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s27, s0, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s28, s2, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s29, s4, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s30, s6, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s31, s8, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s33, s10, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s34, s12, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s35, s14, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s36, s1, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s37, s3, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, s18
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, s18
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s10, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s8, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s6, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s4, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s38, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s17, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s19, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s18, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s16, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s35, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s18, s18, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s19, s19, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s27
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s26
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s25
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s23
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s37
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s35
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s28
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s34
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s29
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s33
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s30
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s31
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s30
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s33
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s29
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s34
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s28
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s35
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s27
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s36
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
-; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s18, 0xffff
+; GCN-HSA-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_and_b32 s19, s0, s18
-; GCN-HSA-NEXT:    s_and_b32 s20, s2, s18
-; GCN-HSA-NEXT:    s_and_b32 s21, s4, s18
-; GCN-HSA-NEXT:    s_and_b32 s22, s6, s18
-; GCN-HSA-NEXT:    s_and_b32 s23, s8, s18
-; GCN-HSA-NEXT:    s_and_b32 s24, s10, s18
-; GCN-HSA-NEXT:    s_and_b32 s25, s12, s18
-; GCN-HSA-NEXT:    s_and_b32 s26, s14, s18
-; GCN-HSA-NEXT:    s_and_b32 s27, s1, s18
-; GCN-HSA-NEXT:    s_and_b32 s28, s3, s18
-; GCN-HSA-NEXT:    s_and_b32 s29, s5, s18
-; GCN-HSA-NEXT:    s_and_b32 s30, s7, s18
-; GCN-HSA-NEXT:    s_and_b32 s31, s9, s18
-; GCN-HSA-NEXT:    s_and_b32 s33, s11, s18
-; GCN-HSA-NEXT:    s_and_b32 s34, s13, s18
-; GCN-HSA-NEXT:    s_and_b32 s18, s15, s18
-; GCN-HSA-NEXT:    s_lshr_b32 s35, s1, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s3, s3, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s5, s5, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s7, s7, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s9, s9, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s11, s11, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s13, s13, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s15, s15, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s14, s14, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s12, s12, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s10, s10, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s8, s8, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s6, s6, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s4, s4, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s2, s2, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s36, s0, 16
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xf0
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xd0
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xb0
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x90
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
+; GCN-HSA-NEXT:    s_lshr_b32 s3, s5, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s20, s7, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s21, s9, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s22, s11, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s23, s13, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s24, s15, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s25, s17, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s26, s19, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s27, s18, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s28, s16, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s29, s14, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s30, s12, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s31, s10, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s33, s8, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s34, s6, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s2, s4, 16
+; GCN-HSA-NEXT:    s_and_b32 s35, s4, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s8, s8, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s10, s10, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s12, s12, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s14, s14, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s16, s16, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s18, s18, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s36, s5, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s13, s13, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s15, s15, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s17, s17, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s19, s19, 0xffff
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xf0
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xd0
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s4
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xb0
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x90
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s26
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s5
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s34
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s17
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s25
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s33
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s11
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x70
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s24
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s31
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s23
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s30
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x50
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s29
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s21
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 48
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s28
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s20
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s36
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xe0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xe0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s27
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s35
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s27
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xc0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xc0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s26
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s14
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s28
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xa0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xa0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s25
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s12
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s29
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x80
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x80
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s24
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s10
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s30
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x60
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s23
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s8
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s31
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 64
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s22
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s33
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s21
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s34
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s35
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s19
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s36
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s17
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i64:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[20:23], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s19, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s18, -1
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[22:23], 0x0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s22, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s16, s20
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s17, s21
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s19, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s18, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s20, s0, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s21, s1, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s23, s2, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s24, s3, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s25, s4, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s26, s5, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s27, s6, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s28, s7, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s29, s8, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s30, s9, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s31, s10, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s33, s11, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s34, s12, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s35, s13, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s36, s14, s22
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s22, s15, s22
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s36, s15, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s15, s15, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s35, s14, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s14, s14, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s36
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s15
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s34, s13, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s13, s13, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s12, s12, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s33, s12, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s35
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s12, s12, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s11, s11, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s35
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s31, s11, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s13
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s11, s11, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s34
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s30, s10, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s33
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s12
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s9, s9, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s33
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s29, s9, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s31
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s9, s9, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s31
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s28, s8, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s30
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s7, s7, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s30
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s27, s7, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s29
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s7, s7, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s29
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s26, s6, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s28
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s5, s5, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s28
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s25, s5, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s27
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s5, s5, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s27
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s24, s4, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s26
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s3, s3, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s26
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s23, s3, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s25
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s3, s3, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s2, s2, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s25
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s22, s2, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s24
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s2, s2, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s1, s1, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s24
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s21, s1, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s23
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s1, s1, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s0, s0, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s23
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s20, s0, 0xffff
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s0, s0, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s21

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 29c8d6a8b21d0..6c4071354cc82 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -3632,24 +3632,21 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_movk_i32 s12, 0x50
-; GCN-HSA-NEXT:    s_movk_i32 s13, 0x60
-; GCN-HSA-NEXT:    s_movk_i32 s14, 0x70
-; GCN-HSA-NEXT:    s_mov_b32 s15, 0xffff
+; GCN-HSA-NEXT:    s_mov_b32 s12, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, s12
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x50
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, s13
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x60
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, s14
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x70
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s4
@@ -3657,12 +3654,12 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s6, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s3, 0
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; GCN-HSA-NEXT:    s_add_u32 s8, s2, 48
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; GCN-HSA-NEXT:    s_addc_u32 s9, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 64
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
@@ -3679,8 +3676,8 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(6)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v26, s15, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v24, s15, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v26, s12, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v24, s12, v0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[32:35], v[0:1]
@@ -3698,16 +3695,16 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0xa0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v26, s15, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v24, s15, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v26, s12, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v24, s12, v2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
 ; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s15, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s15, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
@@ -3717,62 +3714,62 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s15, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s15, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s15, v9
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s15, v8
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v9
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v8
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s8
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s15, v11
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s15, v10
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v11
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s9
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v14
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v13
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v12
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s15, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s15, v12
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, s15, v14
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v13
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v12
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, s12, v14
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s15, v15
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, s12, v15
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(11)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v17
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s15, v17
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s15, v16
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v17
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, s13
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v18
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s15, v19
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s15, v18
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v19
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v18
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, s14
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
@@ -3780,47 +3777,47 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, s12
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v33
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v32
-; GCN-HSA-NEXT:    v_and_b32_e32 v14, s15, v33
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, s15, v32
+; GCN-HSA-NEXT:    v_and_b32_e32 v14, s12, v33
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, s12, v32
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v21
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v20
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s15, v21
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s15, v20
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v21
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v20
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v22
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s15, v23
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, s15, v22
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, s12, v23
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, s12, v22
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v35
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v34
-; GCN-HSA-NEXT:    v_and_b32_e32 v10, s15, v35
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, s15, v34
+; GCN-HSA-NEXT:    v_and_b32_e32 v10, s12, v35
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, s12, v34
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v29
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v28
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s15, v29
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, s15, v28
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, s12, v29
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, s12, v28
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v30
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s15, v31
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s15, v30
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s12, v31
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s12, v30
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
@@ -4415,44 +4412,41 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_movk_i32 s7, 0x70
-; GCN-HSA-NEXT:    s_movk_i32 s8, 0x60
-; GCN-HSA-NEXT:    s_movk_i32 s6, 0x50
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, s7
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x70
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, s8
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x60
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, s6
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x50
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 64
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s4
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 48
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    s_add_u32 s10, s2, 32
-; GCN-HSA-NEXT:    s_addc_u32 s11, s3, 0
+; GCN-HSA-NEXT:    s_add_u32 s6, s2, 32
+; GCN-HSA-NEXT:    s_addc_u32 s7, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s5
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s7
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
@@ -4505,37 +4499,37 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v5
 ; GCN-HSA-NEXT:    v_bfe_i32 v10, v5, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v8, v4, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 16, v7
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 16, v6
 ; GCN-HSA-NEXT:    v_bfe_i32 v9, v7, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v7, v6, 0, 16
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[7:10]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
 ; GCN-HSA-NEXT:    v_bfe_i32 v6, v1, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v4, v0, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
 ; GCN-HSA-NEXT:    v_bfe_i32 v10, v3, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[8:11]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(11)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v17
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v16
@@ -4545,7 +4539,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, s8
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v19
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v18
 ; GCN-HSA-NEXT:    v_bfe_i32 v2, v19, 0, 16
@@ -4554,7 +4548,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, s7
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
@@ -4567,7 +4561,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_bfe_i32 v10, v23, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v8, v22, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, s6
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(11)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v33
@@ -6590,8 +6584,8 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
@@ -6604,8 +6598,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x50
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, 0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
 ; GCN-HSA-NEXT:    v_and_b32_e32 v9, s6, v1
@@ -6613,50 +6606,53 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x70
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
 ; GCN-HSA-NEXT:    v_and_b32_e32 v9, s6, v5
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s5
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s4
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v7
 ; GCN-HSA-NEXT:    v_and_b32_e32 v9, s6, v7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s3
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v11, s6, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v7, s6, v3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s5
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v9, s6, v2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[15:16], v[7:10]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[17:18], v[11:14]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v8
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v11, s6, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s6, v4
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, s6, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 0x60
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v7, s6, v3
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[1:2], v[7:10]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, s6, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GCN-HSA-NEXT:    v_and_b32_e32 v0, s6, v0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[15:16], v[6:9]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[17:18], v[11:14]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[6:7], v[12:15]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64:
@@ -7523,70 +7519,74 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s10
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[5:8]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-HSA-NEXT:    v_and_b32_e32 v3, s18, v16
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[5:8]
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v13
 ; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v13
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
-; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0xe0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
-; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[5:8]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0xc0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v19
 ; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v19
-; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[5:8]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0xa0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
 ; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v17
-; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[5:8]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v9
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
 ; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v11
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0x80
-; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[11:12], v[5:8]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s7
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v9
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s18, v9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s6
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[11:12], v[5:8]
-; GCN-HSA-NEXT:    v_and_b32_e32 v11, s18, v0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[5:8]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, s18, v0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v16
-; GCN-HSA-NEXT:    v_and_b32_e32 v3, s18, v16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v15
-; GCN-HSA-NEXT:    v_and_b32_e32 v7, s18, v15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, s18, v9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[17:20]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, s18, v2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v10
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, s18, v2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[17:20]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64:

diff  --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index 40264342c2950..6af32a90d8527 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -41,7 +41,6 @@ define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float add
 
 ; GCN-LABEL: {{^}}madak_2_use_f32:
 ; GFX9:         v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GFX10:        v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
 ; GFX6-DAG:     buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
 ; GFX6-DAG:     buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; GFX6-DAG:     buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
@@ -54,7 +53,8 @@ define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float add
 ; GFX10-MAD-DAG:v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
 ; FMA-DAG:      v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
 ; MAD-DAG:      v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
-; FMA-DAG:      v_fmac_f32_e32 [[VK]], [[VA]], [[VC]]
+; GFX10-FMA-DAG:v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VC]], 0x41200000
+; GFX940-FMA-DAG:v_fmac_f32_e32 [[VK]], [[VA]], [[VC]]
 ; GCN:          s_endpgm
 define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

diff  --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll
index deded84145b92..70c0287590490 100644
--- a/llvm/test/CodeGen/AMDGPU/max.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.ll
@@ -228,7 +228,9 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %ou
 ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16:
 ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c
-; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]]
+; SI-DAG: s_and_b32 [[A16:s[0-9]+]], [[A]], 0xffff
+; SI-DAG: s_and_b32 [[B16:s[0-9]+]], [[B]], 0xffff
+; SI: s_max_u32 [[MAX:s[0-9]+]], [[A16]], [[B16]]
 ; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
 ; SI: buffer_store_dword [[VMAX]]
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 0927be6a6b7ae..def945fc3b4a4 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -73,7 +73,7 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 ad
 ; FUNC-LABEL: {{^}}mul64_sext_c:
 ; EG-DAG: MULLO_INT
 ; EG-DAG: MULHI_INT
-; SI-DAG: s_mul_i32
+; SI-DAG: s_mulk_i32
 ; SI-DAG: v_mul_hi_i32
 ; VI: v_mad_i64_i32
 define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {

diff  --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 9e3ef2434ee50..f625977b6854d 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -12,11 +12,10 @@ define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, 0xffffff
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s4, s2
-; SI-NEXT:    s_and_b32 s2, s5, s2
-; SI-NEXT:    s_mul_i32 s4, s4, s2
+; SI-NEXT:    s_and_b32 s2, s4, 0xffffff
+; SI-NEXT:    s_and_b32 s4, s5, 0xffffff
+; SI-NEXT:    s_mul_i32 s4, s2, s4
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -26,12 +25,11 @@ define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s6, 0xffffff
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s4, s4, s6
-; VI-NEXT:    s_and_b32 s5, s5, s6
+; VI-NEXT:    s_and_b32 s4, s4, 0xffffff
+; VI-NEXT:    s_and_b32 s5, s5, 0xffffff
 ; VI-NEXT:    s_mul_i32 s4, s4, s5
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -41,13 +39,12 @@ define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s0, 0xffffff
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s2, s0
-; GFX9-NEXT:    s_and_b32 s0, s3, s0
-; GFX9-NEXT:    s_mul_i32 s0, s1, s0
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xffffff
+; GFX9-NEXT:    s_and_b32 s1, s3, 0xffffff
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
@@ -405,13 +402,12 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a,
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s0, 0xffffff
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s2, s0
-; GFX9-NEXT:    s_and_b32 s0, s3, s0
-; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xffffff
+; GFX9-NEXT:    s_and_b32 s1, s3, 0xffffff
+; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
@@ -466,12 +462,11 @@ define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b)
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffffff
 ; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_and_b32 s5, s6, s4
+; GFX9-NEXT:    s_and_b32 s4, s6, 0xffffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s4, s7, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX9-NEXT:    s_and_b32 s5, s7, 0xffffff
+; GFX9-NEXT:    s_mul_hi_u32 s4, s4, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
@@ -494,12 +489,11 @@ define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %
 ; SI-NEXT:    s_load_dword s7, s[0:1], 0xd
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_mov_b32 s8, 0xffffff
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    s_and_b32 s4, s6, s8
+; SI-NEXT:    s_and_b32 s4, s6, 0xffffff
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s5, s7, s8
+; SI-NEXT:    s_and_b32 s5, s7, 0xffffff
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
 ; SI-NEXT:    s_mul_i32 s4, s4, s5
 ; SI-NEXT:    v_mul_hi_u32_u24_e32 v1, s6, v0
@@ -531,14 +525,13 @@ define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffffff
 ; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_and_b32 s5, s6, s4
+; GFX9-NEXT:    s_and_b32 s4, s6, 0xffffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s4, s7, s4
-; GFX9-NEXT:    s_mul_hi_u32 s6, s5, s4
-; GFX9-NEXT:    s_mul_i32 s5, s5, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    s_and_b32 s5, s7, 0xffffff
+; GFX9-NEXT:    s_mul_hi_u32 s6, s4, s5
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
@@ -638,12 +631,11 @@ define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, 0xffff
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s4, s2
-; SI-NEXT:    s_and_b32 s2, s5, s2
-; SI-NEXT:    s_mul_i32 s4, s4, s2
-; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    s_and_b32 s2, s4, 0xffff
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_mul_i32 s2, s2, s4
+; SI-NEXT:    s_lshr_b32 s4, s2, 16
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -653,12 +645,11 @@ define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s6, 0xffff
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s4, s4, s6
-; VI-NEXT:    s_and_b32 s5, s5, s6
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; VI-NEXT:    s_mul_i32 s4, s4, s5
 ; VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
@@ -669,12 +660,11 @@ define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s2, s0
-; GFX9-NEXT:    s_and_b32 s0, s3, s0
-; GFX9-NEXT:    s_mul_i32 s0, s1, s0
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s3, 0xffff
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
@@ -696,15 +686,14 @@ define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s1, 0xffffff
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s3, s2, s1
-; SI-NEXT:    s_and_b32 s1, s0, s1
+; SI-NEXT:    s_and_b32 s1, s2, 0xffffff
+; SI-NEXT:    s_and_b32 s3, s0, 0xffffff
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    v_mul_hi_u32_u24_e32 v0, s2, v0
-; SI-NEXT:    s_mul_i32 s3, s3, s1
+; SI-NEXT:    s_mul_i32 s1, s1, s3
 ; SI-NEXT:    v_and_b32_e32 v1, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_mov_b32_e32 v0, s1
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -725,17 +714,16 @@ define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %
 ;
 ; GFX9-LABEL: test_umul24_i33:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s0, 0xffffff
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s2, s0
-; GFX9-NEXT:    s_and_b32 s0, s3, s0
-; GFX9-NEXT:    s_mul_i32 s2, s1, s0
-; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xffffff
+; GFX9-NEXT:    s_and_b32 s1, s3, 0xffffff
+; GFX9-NEXT:    s_mul_i32 s2, s0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s1
 ; GFX9-NEXT:    s_and_b32 s0, s0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
@@ -783,16 +771,15 @@ define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33
 ;
 ; GFX9-LABEL: test_umulhi24_i33:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s0, 0xffffff
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s2, s0
-; GFX9-NEXT:    s_and_b32 s0, s3, s0
-; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xffffff
+; GFX9-NEXT:    s_and_b32 s1, s3, 0xffffff
+; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s1
 ; GFX9-NEXT:    s_and_b32 s0, s0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 118cfe515ec1c..c2e615e971323 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -80,8 +80,8 @@ define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, [8 x i3
 ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039
 ; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]]
 
-; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
-; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
+; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039
+; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xf237b
 define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
   %or = or i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 10e8d96ed8227..b66b9cc355163 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -565,8 +565,7 @@ define amdgpu_kernel void @fneg_v2f32_vec(<2 x float> addrspace(1)* %a) {
 }
 
 ; GCN-LABEL: {{^}}fneg_v2f32_scalar:
-; GCN:         s_brev_b32 [[SIGN:s[0-9]+]], 1
-; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGN]]
+; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
 define amdgpu_kernel void @fneg_v2f32_scalar(<2 x float> addrspace(1)* %a, <2 x float> %x) {
   %fneg = fsub <2 x float> <float -0.0, float -0.0>, %x
   store <2 x float> %fneg, <2 x float> addrspace(1)* %a, align 8

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index ca0e32fac5bea..d958f5d9d97d6 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -1715,22 +1715,28 @@ define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1,
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s39
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s38, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x1000, v0
+; GFX8-NEXT:    s_movk_i32 s0, 0x1000
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x1800, v0
+; GFX8-NEXT:    s_movk_i32 s0, 0x1800
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s0, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x2000, v0
+; GFX8-NEXT:    s_movk_i32 s0, 0x2000
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s0, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x2800, v12
+; GFX8-NEXT:    s_movk_i32 s0, 0x2800
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s0, v12
 ; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
 ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
 ; GFX8-NEXT:    flat_load_dwordx2 v[6:7], v[6:7]
 ; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v13, vcc
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0x3000, v12
+; GFX8-NEXT:    s_movk_i32 s0, 0x3000
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s0, v12
 ; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v13, vcc
+; GFX8-NEXT:    s_movk_i32 s0, 0x3800
 ; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[8:9]
 ; GFX8-NEXT:    flat_load_dwordx2 v[10:11], v[10:11]
-; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 0x3800, v12
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s0, v12
 ; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v13, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[12:13], v[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(4)
@@ -1780,16 +1786,14 @@ define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1,
 ; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x1000, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_movk_i32 s0, 0x2000
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 0x2000, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:2048
 ; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[4:5], off
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v12
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x2000, v12
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v13, vcc
-; GFX9-NEXT:    s_movk_i32 s0, 0x3000
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v12
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 0x3000, v12
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v13, vcc
 ; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[2:3], off offset:2048
 ; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[4:5], off

diff  --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
index 20943ffa05f10..821576a550c1b 100644
--- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -16,11 +16,9 @@ define amdgpu_kernel void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
   ret void
 }
 
-; FIXME: This should be folded with any number of uses.
 ; SI-LABEL: {{^}}s_addk_i32_k0_x2:
-; SI: s_movk_i32 [[K:s[0-9]+]], 0x41
-; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]]
-; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]]
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
 ; SI: s_endpgm
 define amdgpu_kernel void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) {
   %add0 = add i32 %a, 65

diff  --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
index 2c1aa053b39c4..c5f8abc08f00d 100644
--- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -483,10 +483,9 @@ bb4:
 }
 
 ; GCN-LABEL: {{^}}phi_imm_in_sgprs
-; GCN: s_movk_i32 [[A:s[0-9]+]], 0x400
 ; GCN: s_movk_i32 [[B:s[0-9]+]], 0x400
 ; GCN: [[LOOP_LABEL:.L[0-9a-zA-Z_]+]]:
-; GCN: s_xor_b32 [[B]], [[B]], [[A]]
+; GCN: s_xor_b32 [[B]], [[B]], 0x400
 ; GCN: s_cbranch_scc{{[01]}} [[LOOP_LABEL]]
 define amdgpu_kernel void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) {
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
index d1903f457b306..4a44a89a016f7 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -48,9 +48,8 @@ done:
 
 ; GCN-LABEL: {{^}}legal_offset_fi_offset:
 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}}
-; This constant isn't folded, because it has multiple uses.
 ; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8004
-; GCN-DAG: v_add_{{[iu]}}32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]]
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[OFFSET:v[0-9]+]], vcc, 0x8004
 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}}
 
 define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) {

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index daa60acbd60f9..6fec9f648034a 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -21,79 +21,78 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i32 s12, s3, 31
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v1, v1
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    s_add_u32 s2, s2, s12
 ; GCN-NEXT:    s_mov_b32 s13, s12
-; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
-; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
 ; GCN-NEXT:    s_addc_u32 s3, s3, s12
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GCN-NEXT:    v_mul_lo_u32 v7, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GCN-NEXT:    v_mul_hi_u32 v8, v2, v3
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v5, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
-; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s5, v0
 ; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GCN-NEXT:    v_mul_lo_u32 v7, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v9, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v6, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s2, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s2, v2
-; GCN-NEXT:    v_mul_hi_u32 v6, s3, v2
-; GCN-NEXT:    v_mul_lo_u32 v2, s3, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_mul_lo_u32 v3, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GCN-NEXT:    v_mul_hi_u32 v4, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v5, s3, v1
+; GCN-NEXT:    v_mul_lo_u32 v1, s3, v1
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
 ; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
 ; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s11, v0
@@ -186,12 +185,11 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[12:13], s16
-; GCN-IR-NEXT:    s_add_u32 s20, s6, -1
-; GCN-IR-NEXT:    s_addc_u32 s21, s7, -1
+; GCN-IR-NEXT:    s_add_u32 s19, s6, -1
+; GCN-IR-NEXT:    s_addc_u32 s20, s7, -1
 ; GCN-IR-NEXT:    s_not_b64 s[8:9], s[14:15]
 ; GCN-IR-NEXT:    s_add_u32 s12, s8, s18
-; GCN-IR-NEXT:    s_mov_b32 s19, s15
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, s15
+; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-IR-NEXT:    s_mov_b32 s9, 0
 ; GCN-IR-NEXT:  .LBB0_3: ; %udiv-do-while
@@ -201,8 +199,8 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
 ; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[8:9]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[14:15], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s8, s20, s16
-; GCN-IR-NEXT:    s_subb_u32 s8, s21, s17
+; GCN-IR-NEXT:    s_sub_u32 s8, s19, s16
+; GCN-IR-NEXT:    s_subb_u32 s8, s20, s17
 ; GCN-IR-NEXT:    s_ashr_i32 s14, s8, 31
 ; GCN-IR-NEXT:    s_mov_b32 s15, s14
 ; GCN-IR-NEXT:    s_and_b32 s8, s14, 1
@@ -211,9 +209,9 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_subb_u32 s17, s17, s15
 ; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
 ; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[12:13], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[22:23], s[12:13], 0
 ; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[8:9]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[22:23]
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_3
 ; GCN-IR-NEXT:  .LBB0_4: ; %Flow6
 ; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[10:11], 1
@@ -257,7 +255,6 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_subb_u32_e32 v8, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; GCN-NEXT:    v_rcp_f32_e32 v5, v5
-; GCN-NEXT:    v_mov_b32_e32 v14, 0
 ; GCN-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GCN-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; GCN-NEXT:    v_trunc_f32_e32 v6, v6
@@ -273,7 +270,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v11, v5, v9
 ; GCN-NEXT:    v_mul_hi_u32 v12, v5, v10
 ; GCN-NEXT:    v_mul_hi_u32 v13, v5, v9
-; GCN-NEXT:    v_mul_hi_u32 v15, v6, v9
+; GCN-NEXT:    v_mul_hi_u32 v14, v6, v9
 ; GCN-NEXT:    v_mul_lo_u32 v9, v6, v9
 ; GCN-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; GCN-NEXT:    v_addc_u32_e32 v12, vcc, 0, v13, vcc
@@ -281,7 +278,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_hi_u32 v10, v6, v10
 ; GCN-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v12, v10, vcc
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v15, v14, vcc
+; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v14, vcc
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
@@ -303,7 +300,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v8, v6, v8
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v12, v10, vcc
-; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v9, v14, vcc
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
@@ -324,7 +321,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v6, v1, v6
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v11, v14, vcc
+; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v8, v2, v6
@@ -398,12 +395,10 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[4:5], 63, v[7:8]
 ; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[6:7], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v17, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v5
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v10, v12, 0, s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v16, v17
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v9, v11, 0, s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
@@ -423,10 +418,10 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v19, vcc, -1, v3, vcc
 ; GCN-IR-NEXT:    v_not_b32_e32 v0, v0
 ; GCN-IR-NEXT:    v_lshr_b64 v[14:15], v[11:12], v14
-; GCN-IR-NEXT:    v_not_b32_e32 v9, v17
+; GCN-IR-NEXT:    v_not_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_add_i32_e32 v11, vcc, v0, v13
-; GCN-IR-NEXT:    v_addc_u32_e32 v12, vcc, v9, v16, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v16, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v12, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v17, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
@@ -1034,12 +1029,11 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[12:13], s16
-; GCN-IR-NEXT:    s_add_u32 s20, s6, -1
-; GCN-IR-NEXT:    s_addc_u32 s21, s7, -1
+; GCN-IR-NEXT:    s_add_u32 s19, s6, -1
+; GCN-IR-NEXT:    s_addc_u32 s20, s7, -1
 ; GCN-IR-NEXT:    s_not_b64 s[8:9], s[14:15]
 ; GCN-IR-NEXT:    s_add_u32 s12, s8, s18
-; GCN-IR-NEXT:    s_mov_b32 s19, s15
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, s15
+; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-IR-NEXT:    s_mov_b32 s9, 0
 ; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
@@ -1049,8 +1043,8 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
 ; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[8:9]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[14:15], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s8, s20, s16
-; GCN-IR-NEXT:    s_subb_u32 s8, s21, s17
+; GCN-IR-NEXT:    s_sub_u32 s8, s19, s16
+; GCN-IR-NEXT:    s_subb_u32 s8, s20, s17
 ; GCN-IR-NEXT:    s_ashr_i32 s14, s8, 31
 ; GCN-IR-NEXT:    s_mov_b32 s15, s14
 ; GCN-IR-NEXT:    s_and_b32 s8, s14, 1
@@ -1059,9 +1053,9 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_subb_u32 s17, s17, s15
 ; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
 ; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[12:13], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[22:23], s[12:13], 0
 ; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[8:9]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[22:23]
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_3
 ; GCN-IR-NEXT:  .LBB9_4: ; %Flow3
 ; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[10:11], 1
@@ -1111,58 +1105,57 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    s_subb_u32 s5, 0, s3
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v1, v1
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
-; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v7, v2, v5
-; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GCN-NEXT:    v_mul_lo_u32 v7, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v9, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v6, vcc
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s5, v0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_mul_lo_u32 v3, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v2, v1, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, v0, 24
 ; GCN-NEXT:    v_mul_hi_u32 v1, v1, 24
@@ -1310,7 +1303,6 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_subb_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
 ; GCN-NEXT:    v_rcp_f32_e32 v3, v3
-; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
@@ -1326,7 +1318,7 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v9, v3, v7
 ; GCN-NEXT:    v_mul_hi_u32 v10, v3, v8
 ; GCN-NEXT:    v_mul_hi_u32 v11, v3, v7
-; GCN-NEXT:    v_mul_hi_u32 v13, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v12, v4, v7
 ; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v11, vcc
@@ -1334,7 +1326,7 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v8, v4, v8
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v10, v8, vcc
-; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v13, v12, vcc
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v12, vcc
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
@@ -1356,7 +1348,7 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v6, v4, v6
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v10, v8, vcc
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v7, v12, vcc
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
@@ -1417,25 +1409,24 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
 ; GCN-IR-NEXT:    v_min_u32_e32 v8, v4, v5
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffc5
-; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, s6, v8
-; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v5, vcc, s6, v8
+; GCN-IR-NEXT:    v_addc_u32_e64 v6, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[5:6]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[5:6]
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, 24, 0, s[4:5]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v7, 24, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, v9
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB11_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v4
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v6, vcc
+; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v5
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[9:10]
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], 24, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
@@ -1445,10 +1436,10 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], 24, v10
+; GCN-IR-NEXT:    v_lshr_b64 v[10:11], 24, v9
 ; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 58, v8
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_subb_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], 0, 0, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
@@ -1481,12 +1472,12 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:  .LBB11_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[4:5], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v7, v1
-; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v7, v1
+; GCN-IR-NEXT:    v_or_b32_e32 v7, v6, v0
 ; GCN-IR-NEXT:  .LBB11_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, v6, v2
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, v7, v3
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v7, v2
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, v4, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
@@ -1509,7 +1500,6 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_subb_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
 ; GCN-NEXT:    v_rcp_f32_e32 v3, v3
-; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
@@ -1525,7 +1515,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v9, v3, v7
 ; GCN-NEXT:    v_mul_hi_u32 v10, v3, v8
 ; GCN-NEXT:    v_mul_hi_u32 v11, v3, v7
-; GCN-NEXT:    v_mul_hi_u32 v13, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v12, v4, v7
 ; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v11, vcc
@@ -1533,7 +1523,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v8, v4, v8
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v10, v8, vcc
-; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v13, v12, vcc
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v12, vcc
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
@@ -1555,7 +1545,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v6, v4, v6
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v10, v8, vcc
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v7, v12, vcc
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
@@ -1612,27 +1602,26 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
 ; GCN-IR-NEXT:    v_min_u32_e32 v8, v4, v5
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, s6, v8
-; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v5, vcc, s6, v8
+; GCN-IR-NEXT:    v_addc_u32_e64 v6, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, s8
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[5:6]
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s[4:5]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[5:6]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v7, v7, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB12_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v4
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v6, vcc
+; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v5
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[9:10]
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], s[8:9], v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
@@ -1643,10 +1632,10 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v0
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v10
+; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v9
 ; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 47, v8
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_subb_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], 0, 0, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
@@ -1679,12 +1668,12 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:  .LBB12_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[4:5], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v7, v1
-; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v7, v1
+; GCN-IR-NEXT:    v_or_b32_e32 v7, v6, v0
 ; GCN-IR-NEXT:  .LBB12_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, v6, v2
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, v7, v3
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v7, v2
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, v4, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index aa97bb0b76bd3..ada9fe811185e 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -516,13 +516,15 @@ store_label:
 ; NOSDWA-NOT: v_or_b32_sdwa
 
 ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9_10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX89-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ;
 ; GFX10-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ;
 ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9_10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX89-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ;
 ; GFX10-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD

diff  --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
index 02535610e0277..73d72f44a1c06 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -156,12 +156,12 @@ define amdgpu_kernel void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a,
 ; FUNC-LABEL: {{^}}cmp_zext_k_i8max:
 ; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GCN: s_movk_i32 [[K255:s[0-9]+]], 0xff
-; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]]
-; SI: s_cmp_lg_u32 [[B]], [[K255]]
+; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], 0xff
+; SI: s_cmpk_lg_i32 [[B]], 0xff
 ; SI: s_cselect_b64 [[CC:[^,]+]], -1, 0
 
-; VI: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]]
+; VI: v_mov_b32_e32 [[VK255:v[0-9]+]], 0xff
+; VI: s_movk_i32 [[K255:s[0-9]+]], 0xff
 ; VI: v_and_b32_e32 [[B:v[0-9]+]], [[VALUE]], [[VK255]]
 ; VI: v_cmp_ne_u16_e32 vcc, [[K255]], [[B]]
 
@@ -208,9 +208,8 @@ define void @v_cmp_sext_k_neg1_i8_sext_arg(i8 signext %b) nounwind {
 ; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg:
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
 ; VI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GCN: s_movk_i32 [[K:s[0-9]+]], 0xff
-; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]]
-; GCN: s_cmp_lg_u32 [[B]], [[K]]{{$}}
+; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], 0xff
+; GCN: s_cmpk_lg_i32 [[B]], 0xff{{$}}
 ; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN: buffer_store_byte [[RESULT]]

diff  --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index 78ad2625a673a..f914c9645df90 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -149,9 +149,9 @@ define i128 @v_lshr_i128_kv(i128 %rhs) {
 ; GCN-NEXT:    s_mov_b64 s[4:5], 0x41
 ; GCN-NEXT:    v_lshr_b64 v[1:2], s[4:5], v0
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
-; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v3, 0x41
 ; GCN-NEXT:    s_and_b64 vcc, s[4:5], vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v1, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 08b045e4a623f..fb15854e1fb97 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -21,17 +21,16 @@ define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s6, 0xffff
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s7, s4, s6
+; VI-NEXT:    s_and_b32 s6, s4, 0xffff
 ; VI-NEXT:    s_lshr_b32 s4, s4, 16
-; VI-NEXT:    s_lshr_b32 s8, s5, 16
-; VI-NEXT:    s_lshl_b32 s4, s4, s8
-; VI-NEXT:    s_lshl_b32 s5, s7, s5
+; VI-NEXT:    s_lshr_b32 s7, s5, 16
+; VI-NEXT:    s_lshl_b32 s4, s4, s7
+; VI-NEXT:    s_lshl_b32 s5, s6, s5
 ; VI-NEXT:    s_lshl_b32 s4, s4, 16
-; VI-NEXT:    s_and_b32 s5, s5, s6
+; VI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 5326836b7912b..edbc1fcd5842e 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -1822,18 +1822,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)*
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    v_mov_b32_e32 v4, 0xffffc400
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    s_movk_i32 s2, 0xc400
-; VI-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, s2, v3
+; VI-NEXT:    v_add_u16_e32 v2, 0xc400, v3
 ; VI-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -1895,18 +1894,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    s_movk_i32 s2, 0x4400
-; VI-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, s2, v3
+; VI-NEXT:    v_add_u16_e32 v2, 0x4400, v3
 ; VI-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -1968,18 +1966,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)*
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    v_mov_b32_e32 v4, 0x4000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    s_movk_i32 s2, 0x4000
-; VI-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, s2, v3
+; VI-NEXT:    v_add_u16_e32 v2, 0x4000, v3
 ; VI-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -2041,18 +2038,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    v_mov_b32_e32 v4, 0xffffc000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    s_movk_i32 s2, 0xc000
-; VI-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, s2, v3
+; VI-NEXT:    v_add_u16_e32 v2, 0xc000, v3
 ; VI-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-NEXT:    flat_store_dword v[0:1], v2

diff  --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 29c20c69c94f1..30f021d87e53c 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -31,14 +31,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr10
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr8
   ; CHECK-NEXT:   undef %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %56, 232, 0 :: (load (s64) from %ir.40, addrspace 4)
-  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; CHECK-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc
-  ; CHECK-NEXT:   %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc
+  ; CHECK-NEXT:   %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, 65535, implicit-def dead $scc
   ; CHECK-NEXT:   undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_2]], implicit-def $scc
   ; CHECK-NEXT:   %130.sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0 :: (load (s128) from %ir.84, addrspace 4)
@@ -195,7 +194,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0 :: (load (s128) from %ir.230, addrspace 4)
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:sgpr_128 = COPY %71
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0 :: (load (s128) from %ir.236, addrspace 4)
-  ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, [[S_MOV_B32_]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 65535, implicit-def dead $scc
   ; CHECK-NEXT:   [[COPY14]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
   ; CHECK-NEXT:   [[COPY14]].sub1:sgpr_128 = COPY [[S_AND_B32_]]
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY14]], 0, 0 :: (dereferenceable invariant load (s32))
@@ -211,7 +210,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "BufferResource", align 1, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %468, 0, 0 :: (load (s64) from %ir.320, addrspace 4)
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:sgpr_128 = COPY %71
-  ; CHECK-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, [[S_MOV_B32_]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc
   ; CHECK-NEXT:   [[COPY15]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
   ; CHECK-NEXT:   [[COPY15]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
@@ -231,7 +230,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "BufferResource", align 1, addrspace 4)
   ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM24]]
   ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM23]]
-  ; CHECK-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], [[S_MOV_B32_]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc
   ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:sgpr_128 = COPY %71
   ; CHECK-NEXT:   [[COPY16]].sub1:sgpr_128 = COPY [[S_AND_B32_2]]
   ; CHECK-NEXT:   [[COPY16]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]]

diff  --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index b227e29b2e389..1b486681037ce 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -17,72 +17,71 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_mov_b32 s4, s8
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_mov_b32 s5, s9
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v1, v1
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v3, s0, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s0, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s1, v0
-; GCN-NEXT:    v_mul_lo_u32 v5, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GCN-NEXT:    v_mul_lo_u32 v7, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v5, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s0, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s1, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GCN-NEXT:    v_mul_lo_u32 v7, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v9, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v6, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s10, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s10, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s10, v2
-; GCN-NEXT:    v_mul_hi_u32 v6, s11, v2
-; GCN-NEXT:    v_mul_lo_u32 v2, s11, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_mul_lo_u32 v3, s0, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
+; GCN-NEXT:    v_mul_hi_u32 v4, s10, v1
+; GCN-NEXT:    v_mul_hi_u32 v5, s11, v1
+; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s11, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v1, s12, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
@@ -161,9 +160,8 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_add_u32 s16, s4, -1
 ; GCN-IR-NEXT:    s_addc_u32 s17, s5, -1
 ; GCN-IR-NEXT:    s_not_b64 s[6:7], s[10:11]
-; GCN-IR-NEXT:    s_mov_b32 s15, s11
 ; GCN-IR-NEXT:    s_add_u32 s10, s6, s14
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, s11
+; GCN-IR-NEXT:    s_addc_u32 s11, s7, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:  .LBB0_3: ; %udiv-do-while
@@ -234,7 +232,6 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_subb_u32_e32 v7, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GCN-NEXT:    v_rcp_f32_e32 v4, v4
-; GCN-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GCN-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
@@ -250,7 +247,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v10, v4, v8
 ; GCN-NEXT:    v_mul_hi_u32 v11, v4, v9
 ; GCN-NEXT:    v_mul_hi_u32 v12, v4, v8
-; GCN-NEXT:    v_mul_hi_u32 v14, v5, v8
+; GCN-NEXT:    v_mul_hi_u32 v13, v5, v8
 ; GCN-NEXT:    v_mul_lo_u32 v8, v5, v8
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
@@ -258,7 +255,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v14, v13, vcc
+; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v13, vcc
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
@@ -280,7 +277,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v7, v5, v7
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
+; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
@@ -301,7 +298,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v5, v1, v5
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v10, v13, vcc
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v10, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
@@ -354,101 +351,99 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GCN-IR-NEXT:    v_xor_b32_e32 v3, v3, v6
-; GCN-IR-NEXT:    v_sub_i32_e32 v5, vcc, v2, v6
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v3, v6, vcc
-; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[5:6]
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v6, v2
+; GCN-IR-NEXT:    v_add_i32_e64 v6, s[6:7], 32, v6
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v3
+; GCN-IR-NEXT:    v_min_u32_e32 v10, v6, v7
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v6, v0
+; GCN-IR-NEXT:    v_add_i32_e64 v6, s[6:7], 32, v6
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v1
+; GCN-IR-NEXT:    v_min_u32_e32 v11, v6, v7
+; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[6:7], v10, v11
+; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v5
-; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
-; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, 32, v3
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v6
-; GCN-IR-NEXT:    v_min_u32_e32 v3, v3, v7
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v0
-; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 32, v7
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v8, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v12, v7, v8
-; GCN-IR-NEXT:    v_sub_i32_e32 v7, vcc, v3, v12
-; GCN-IR-NEXT:    v_subb_u32_e64 v8, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[7:8]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[4:5], 63, v[7:8]
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[6:7], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v11
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v10, v1, 0, s[6:7]
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], s[4:5]
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v9, v0, 0, s[6:7]
+; GCN-IR-NEXT:    v_subb_u32_e64 v8, s[6:7], 0, 0, s[6:7]
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[6:7], 63, v[7:8]
+; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[7:8]
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v4
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v9, v1, 0, s[4:5]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v0, 0, s[4:5]
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v7
-; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v8, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[4:5], 63, v7
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[0:1], v7
+; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v7
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v8, vcc
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[4:5], 63, v7
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[0:1], v6
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_add_i32_e32 v18, vcc, -1, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v19, vcc, -1, v6, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v3, v3
-; GCN-IR-NEXT:    v_lshr_b64 v[14:15], v[0:1], v14
-; GCN-IR-NEXT:    v_not_b32_e32 v9, v11
-; GCN-IR-NEXT:    v_add_i32_e32 v11, vcc, v3, v12
-; GCN-IR-NEXT:    v_mov_b32_e32 v16, 0
-; GCN-IR-NEXT:    v_addc_u32_e32 v12, vcc, v9, v13, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, -1, v2
+; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, -1, v3, vcc
+; GCN-IR-NEXT:    v_not_b32_e32 v9, v10
+; GCN-IR-NEXT:    v_lshr_b64 v[12:13], v[0:1], v12
+; GCN-IR-NEXT:    v_not_b32_e32 v8, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, v9, v11
+; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v8, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v17, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[14:15], v[14:15], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v3, 31, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v14, v3
-; GCN-IR-NEXT:    v_sub_i32_e32 v9, vcc, v18, v3
-; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[7:8], 1
-; GCN-IR-NEXT:    v_subb_u32_e32 v9, vcc, v19, v15, vcc
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v13, 31, v9
-; GCN-IR-NEXT:    v_add_i32_e32 v11, vcc, 1, v11
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v16, v7
-; GCN-IR-NEXT:    v_and_b32_e32 v9, 1, v13
-; GCN-IR-NEXT:    v_and_b32_e32 v16, v13, v6
-; GCN-IR-NEXT:    v_and_b32_e32 v13, v13, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v12, vcc, 0, v12, vcc
-; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[11:12]
-; GCN-IR-NEXT:    v_sub_i32_e64 v14, s[4:5], v3, v13
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v17, v8
-; GCN-IR-NEXT:    v_subb_u32_e64 v15, s[4:5], v15, v16, s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v17, v10
+; GCN-IR-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v8, 31, v7
+; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v8
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, v16, v12
+; GCN-IR-NEXT:    v_subb_u32_e32 v8, vcc, v17, v13, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v14, v6
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v10
+; GCN-IR-NEXT:    v_or_b32_e32 v7, v15, v7
+; GCN-IR-NEXT:    v_and_b32_e32 v8, 1, v14
+; GCN-IR-NEXT:    v_and_b32_e32 v15, v14, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v14, v14, v2
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v12, v14
+; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v13, v15, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v15, v9
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v16, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v14, v8
 ; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB1_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[7:8], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v8
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v9, v9, v7
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v8, v6
 ; GCN-IR-NEXT:  .LBB1_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, v5, v10
-; GCN-IR-NEXT:    v_mul_hi_u32 v7, v5, v9
-; GCN-IR-NEXT:    v_mul_lo_u32 v6, v6, v9
-; GCN-IR-NEXT:    v_mul_lo_u32 v5, v5, v9
+; GCN-IR-NEXT:    v_mul_lo_u32 v7, v2, v9
+; GCN-IR-NEXT:    v_mul_hi_u32 v8, v2, v6
+; GCN-IR-NEXT:    v_mul_lo_u32 v3, v3, v6
+; GCN-IR-NEXT:    v_mul_lo_u32 v2, v2, v6
+; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
-; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %x, %y
   ret i64 %result
@@ -862,29 +857,28 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64
 define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem33_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GCN-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s10, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[10:11], 31
-; GCN-NEXT:    s_ashr_i64 s[4:5], s[0:1], 31
+; GCN-NEXT:    s_ashr_i64 s[2:3], s[6:7], 31
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[0:1], 31
 ; GCN-NEXT:    s_ashr_i32 s0, s1, 31
-; GCN-NEXT:    s_add_u32 s4, s4, s0
+; GCN-NEXT:    s_add_u32 s8, s8, s0
 ; GCN-NEXT:    s_mov_b32 s1, s0
-; GCN-NEXT:    s_addc_u32 s5, s5, s0
-; GCN-NEXT:    s_xor_b64 s[12:13], s[4:5], s[0:1]
+; GCN-NEXT:    s_addc_u32 s9, s9, s0
+; GCN-NEXT:    s_xor_b64 s[12:13], s[8:9], s[0:1]
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
 ; GCN-NEXT:    s_sub_u32 s0, 0, s12
 ; GCN-NEXT:    s_subb_u32 s1, 0, s13
-; GCN-NEXT:    s_ashr_i32 s10, s11, 31
+; GCN-NEXT:    s_ashr_i32 s6, s7, 31
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    s_mov_b32 s11, s10
-; GCN-NEXT:    s_mov_b32 s4, s8
-; GCN-NEXT:    s_mov_b32 s5, s9
+; GCN-NEXT:    s_mov_b32 s7, s6
+; GCN-NEXT:    s_mov_b32 s8, s4
+; GCN-NEXT:    s_mov_b32 s9, s5
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
@@ -899,16 +893,16 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT:    v_mul_lo_u32 v7, v1, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -919,25 +913,25 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_mul_lo_u32 v3, s0, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GCN-NEXT:    v_mul_lo_u32 v7, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v9, v0, v2
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v4, v1, v2
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v5, vcc
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT:    s_add_u32 s0, s2, s10
+; GCN-NEXT:    s_add_u32 s0, s2, s6
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    s_addc_u32 s1, s3, s10
+; GCN-NEXT:    s_addc_u32 s1, s3, s6
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT:    s_xor_b64 s[14:15], s[0:1], s[10:11]
+; GCN-NEXT:    s_xor_b64 s[14:15], s[0:1], s[6:7]
 ; GCN-NEXT:    v_mul_lo_u32 v2, s14, v1
 ; GCN-NEXT:    v_mul_hi_u32 v3, s14, v0
 ; GCN-NEXT:    v_mul_hi_u32 v4, s14, v1
@@ -949,7 +943,7 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_mul_hi_u32 v0, s15, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v1, s12, v1
@@ -987,12 +981,12 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, s10, v0
-; GCN-NEXT:    v_xor_b32_e32 v1, s10, v1
-; GCN-NEXT:    v_mov_b32_e32 v2, s10
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s10, v0
+; GCN-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GCN-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s6, v0
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_srem33_64:
@@ -1047,9 +1041,8 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_add_u32 s18, s8, -1
 ; GCN-IR-NEXT:    s_addc_u32 s19, s9, -1
 ; GCN-IR-NEXT:    s_not_b64 s[6:7], s[12:13]
-; GCN-IR-NEXT:    s_mov_b32 s17, s13
 ; GCN-IR-NEXT:    s_add_u32 s12, s6, s16
-; GCN-IR-NEXT:    s_addc_u32 s13, s7, s13
+; GCN-IR-NEXT:    s_addc_u32 s13, s7, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:  .LBB8_3: ; %udiv-do-while
@@ -1205,9 +1198,8 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_add_u32 s18, s6, -1
 ; GCN-IR-NEXT:    s_addc_u32 s19, s7, -1
 ; GCN-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
-; GCN-IR-NEXT:    s_mov_b32 s17, s13
 ; GCN-IR-NEXT:    s_add_u32 s12, s8, s16
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, s13
+; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
 ; GCN-IR-NEXT:    s_mov_b32 s9, 0
 ; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
@@ -1288,58 +1280,57 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v1, v1
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v3, s2, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s3, v0
-; GCN-NEXT:    v_mul_lo_u32 v5, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v7, v2, v5
-; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s2, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GCN-NEXT:    v_mul_lo_u32 v7, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v9, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v6, vcc
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v2, v1, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, v0, 24
 ; GCN-NEXT:    v_mul_hi_u32 v1, v1, 24
@@ -1483,7 +1474,6 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
-; GCN-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
@@ -1499,7 +1489,7 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v8, v2, v6
 ; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; GCN-NEXT:    v_mul_hi_u32 v10, v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v12, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v11, v3, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
@@ -1507,7 +1497,7 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v7, v3, v7
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v12, v11, vcc
+; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
@@ -1529,7 +1519,7 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v6, v11, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -1585,24 +1575,23 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
 ; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffc5
-; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, s6, v6
-; GCN-IR-NEXT:    v_addc_u32_e64 v4, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, s6, v6
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[3:4]
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[3:4]
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, 24, 0, s[4:5]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v3, 24, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, v7
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB11_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v4, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v3
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
+; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v4
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], 24, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
@@ -1612,10 +1601,10 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[8:9], 24, v8
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], 24, v7
 ; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 58, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT:    v_subb_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
@@ -1647,17 +1636,17 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB11_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v4, v2
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[2:3], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v7
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v4, v6
 ; GCN-IR-NEXT:  .LBB11_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, v0, v5
-; GCN-IR-NEXT:    v_mul_hi_u32 v4, v0, v2
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v2
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v2
-; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GCN-IR-NEXT:    v_mul_lo_u32 v2, v0, v2
+; GCN-IR-NEXT:    v_mul_hi_u32 v4, v0, v3
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v3
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
@@ -1680,7 +1669,6 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
-; GCN-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
@@ -1696,7 +1684,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v8, v2, v6
 ; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; GCN-NEXT:    v_mul_hi_u32 v10, v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v12, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v11, v3, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
@@ -1704,7 +1692,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v7, v3, v7
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v12, v11, vcc
+; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
@@ -1726,7 +1714,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v6, v11, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -1778,26 +1766,25 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
 ; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s6, v6
-; GCN-IR-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, s6, v6
+; GCN-IR-NEXT:    v_addc_u32_e64 v4, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[2:3]
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[3:4]
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[2:3]
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s[4:5]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[3:4]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, v7
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB12_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v2
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v3
+; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
+; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v3
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
@@ -1808,10 +1795,10 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v8
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v7
 ; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT:    v_subb_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
@@ -1843,15 +1830,15 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB12_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v4, v2
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[2:3], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v7
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v4, v6
 ; GCN-IR-NEXT:  .LBB12_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mul_lo_u32 v2, v0, v5
-; GCN-IR-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v4
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v4
+; GCN-IR-NEXT:    v_mul_lo_u32 v2, v0, v2
+; GCN-IR-NEXT:    v_mul_hi_u32 v3, v0, v5
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v5
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v5
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
index bece8aab1e3d2..940071149e132 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
@@ -186,13 +186,12 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX10-NEXT:    v_add_f16_e32 v5, v0, v2
-; GFX10-NEXT:    v_add_f16_e32 v6, v1, v3
+; GFX10-NEXT:    v_add_f16_e32 v4, v0, v2
+; GFX10-NEXT:    v_add_f16_e32 v5, v1, v3
 ; GFX10-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_and_b32_e32 v2, v4, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, v4, v6
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
index 4326ecd534622..3831190a538d0 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
@@ -132,14 +132,13 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
 ; GFX10-NEXT:    v_fmac_f16_e32 v4, v0, v2
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    v_fmac_f16_e32 v5, v1, v3
 ; GFX10-NEXT:    v_fmac_f16_e32 v6, v8, v7
 ; GFX10-NEXT:    v_fmac_f16_e32 v9, v11, v10
-; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
-; GFX10-NEXT:    v_and_b32_e32 v2, v0, v5
-; GFX10-NEXT:    v_lshl_or_b32 v0, v9, 16, v1
-; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v5
+; GFX10-NEXT:    v_lshl_or_b32 v0, v9, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x half> %val

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
index 00aa96fea128a..3dcdc69704f34 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
@@ -186,13 +186,12 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX10-NEXT:    v_mul_f16_e32 v5, v0, v2
-; GFX10-NEXT:    v_mul_f16_e32 v6, v1, v3
+; GFX10-NEXT:    v_mul_f16_e32 v4, v0, v2
+; GFX10-NEXT:    v_mul_f16_e32 v5, v1, v3
 ; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_and_b32_e32 v2, v4, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, v4, v6
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
index 802c08fe64a26..3d4a879d9ee36 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
@@ -206,13 +206,12 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX10-NEXT:    v_sub_f16_e32 v5, v0, v2
-; GFX10-NEXT:    v_sub_f16_e32 v6, v1, v3
+; GFX10-NEXT:    v_sub_f16_e32 v4, v0, v2
+; GFX10-NEXT:    v_sub_f16_e32 v5, v1, v3
 ; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_and_b32_e32 v2, v4, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, v4, v6
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 4c5e61ca626ce..3c74feb7ea441 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -626,7 +626,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
 ; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    v_pk_sub_i16 v2, v1, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_and_b32_e32 v0, v3, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v2
 ; GFX10-NEXT:    v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index 54dfc8a7f353e..b4ee382704a78 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -33,11 +33,10 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX10-NEXT:    v_min_u16 v0, v0, s4
+; GFX10-NEXT:    v_min_u16 v0, 0xff, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
   ret i8 %result

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 2a3100979883e..79c7c8b3b6d81 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -500,7 +500,6 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; GFX1030:       ; %bb.0:
 ; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1030-NEXT:    s_mov_b32 s0, 0x4f7ffffe
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
@@ -510,8 +509,8 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; GFX1030-NEXT:    v_sub_nc_u32_e32 v8, 0, v3
 ; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GFX1030-NEXT:    v_mul_f32_e32 v5, s0, v5
-; GFX1030-NEXT:    v_mul_f32_e32 v6, s0, v6
+; GFX1030-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GFX1030-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX1030-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX1030-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX1030-NEXT:    v_mul_lo_u32 v7, v7, v5
@@ -893,7 +892,6 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GFX1030:       ; %bb.0:
 ; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX1030-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1030-NEXT:    s_mov_b32 s0, 0x4f7ffffe
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030-NEXT:    s_clause 0x1
 ; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7] offset:16
@@ -911,10 +909,10 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GFX1030-NEXT:    v_sub_nc_u32_e32 v14, 0, v1
 ; GFX1030-NEXT:    v_sub_nc_u32_e32 v15, 0, v2
 ; GFX1030-NEXT:    v_sub_nc_u32_e32 v16, 0, v3
-; GFX1030-NEXT:    v_mul_f32_e32 v9, s0, v9
-; GFX1030-NEXT:    v_mul_f32_e32 v10, s0, v10
-; GFX1030-NEXT:    v_mul_f32_e32 v11, s0, v11
-; GFX1030-NEXT:    v_mul_f32_e32 v12, s0, v12
+; GFX1030-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v9
+; GFX1030-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
+; GFX1030-NEXT:    v_mul_f32_e32 v11, 0x4f7ffffe, v11
+; GFX1030-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
 ; GFX1030-NEXT:    v_cvt_u32_f32_e32 v9, v9
 ; GFX1030-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; GFX1030-NEXT:    v_cvt_u32_f32_e32 v11, v11
@@ -2154,16 +2152,15 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocaptu
 ; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
-; GFX1030-NEXT:    s_mov_b32 s0, 0x1389c755
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
-; GFX1030-NEXT:    v_mul_hi_u32 v0, v0, s0
-; GFX1030-NEXT:    v_mul_hi_u32 v1, v1, s0
-; GFX1030-NEXT:    v_mul_hi_u32 v2, v2, s0
-; GFX1030-NEXT:    v_mul_hi_u32 v3, v3, s0
+; GFX1030-NEXT:    v_mul_hi_u32 v0, 0x1389c755, v0
+; GFX1030-NEXT:    v_mul_hi_u32 v1, 0x1389c755, v1
+; GFX1030-NEXT:    v_mul_hi_u32 v2, 0x1389c755, v2
+; GFX1030-NEXT:    v_mul_hi_u32 v3, 0x1389c755, v3
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
@@ -2499,7 +2496,6 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; SI-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
 ; SI-NEXT:    v_rcp_f32_e32 v2, v2
 ; SI-NEXT:    s_mov_b32 s4, 0xfffe7960
-; SI-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; SI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; SI-NEXT:    v_trunc_f32_e32 v3, v3
@@ -2513,16 +2509,16 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; SI-NEXT:    v_mul_hi_u32 v7, v2, v5
 ; SI-NEXT:    v_mul_lo_u32 v6, v2, v4
-; SI-NEXT:    v_mul_hi_u32 v9, v2, v4
-; SI-NEXT:    v_mul_hi_u32 v10, v3, v4
+; SI-NEXT:    v_mul_hi_u32 v8, v2, v4
+; SI-NEXT:    v_mul_hi_u32 v9, v3, v4
 ; SI-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
-; SI-NEXT:    v_mul_lo_u32 v9, v3, v5
+; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; SI-NEXT:    v_mul_lo_u32 v8, v3, v5
 ; SI-NEXT:    v_mul_hi_u32 v5, v3, v5
-; SI-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; SI-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; SI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; SI-NEXT:    v_addc_u32_e32 v6, vcc, v10, v8, vcc
+; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -2535,16 +2531,16 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; SI-NEXT:    v_mul_lo_u32 v5, v2, v4
 ; SI-NEXT:    v_mul_hi_u32 v7, v2, v6
-; SI-NEXT:    v_mul_hi_u32 v9, v2, v4
-; SI-NEXT:    v_mul_hi_u32 v10, v3, v4
+; SI-NEXT:    v_mul_hi_u32 v8, v2, v4
+; SI-NEXT:    v_mul_hi_u32 v9, v3, v4
 ; SI-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
-; SI-NEXT:    v_mul_lo_u32 v9, v3, v6
+; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; SI-NEXT:    v_mul_lo_u32 v8, v3, v6
 ; SI-NEXT:    v_mul_hi_u32 v6, v3, v6
-; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; SI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
-; SI-NEXT:    v_addc_u32_e32 v6, vcc, v10, v8, vcc
+; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -2560,18 +2556,18 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; SI-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; SI-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
-; SI-NEXT:    v_addc_u32_e32 v4, vcc, v7, v8, vcc
+; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; SI-NEXT:    v_mul_lo_u32 v4, v3, s4
 ; SI-NEXT:    v_mul_hi_u32 v5, v2, s4
 ; SI-NEXT:    v_mul_lo_u32 v6, v2, s4
+; SI-NEXT:    s_mov_b32 s4, 0x1869f
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
 ; SI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
-; SI-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v0
+; SI-NEXT:    v_subrev_i32_e32 v4, vcc, 0x186a0, v0
 ; SI-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v1, vcc
-; SI-NEXT:    s_mov_b32 s4, 0x1869f
 ; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v4
 ; SI-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
@@ -2599,7 +2595,6 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; VI-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
 ; VI-NEXT:    v_rcp_f32_e32 v2, v2
 ; VI-NEXT:    s_mov_b32 s6, 0xfffe7960
-; VI-NEXT:    v_mov_b32_e32 v9, 0
 ; VI-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; VI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; VI-NEXT:    v_trunc_f32_e32 v3, v3
@@ -2612,31 +2607,30 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
 ; VI-NEXT:    v_mul_hi_u32 v5, v6, v2
 ; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
-; VI-NEXT:    v_add_u32_e32 v10, vcc, v5, v3
+; VI-NEXT:    v_add_u32_e32 v9, vcc, v5, v3
 ; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
-; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v4, vcc
+; VI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
 ; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0
-; VI-NEXT:    v_add_u32_e32 v2, vcc, v10, v2
-; VI-NEXT:    v_addc_u32_e32 v2, vcc, v11, v3, vcc
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
+; VI-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
 ; VI-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
 ; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
 ; VI-NEXT:    v_mul_lo_u32 v4, v7, s6
-; VI-NEXT:    s_mov_b32 s6, 0x186a0
 ; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, v3, v4
 ; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
 ; VI-NEXT:    v_mul_hi_u32 v8, v6, v2
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
 ; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
-; VI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
+; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v4, vcc
 ; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
-; VI-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
+; VI-NEXT:    v_addc_u32_e32 v2, vcc, v9, v3, vcc
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v6, v2
@@ -2649,16 +2643,17 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
 ; VI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v3, vcc
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v4
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; VI-NEXT:    v_mul_lo_u32 v6, v5, s6
-; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s6, 0
+; VI-NEXT:    s_mov_b32 s4, 0x186a0
+; VI-NEXT:    v_mul_lo_u32 v6, v5, s4
+; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0
 ; VI-NEXT:    s_mov_b32 s4, 0x1869f
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
 ; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; VI-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
+; VI-NEXT:    v_subrev_u32_e32 v2, vcc, 0x186a0, v0
 ; VI-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
 ; VI-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
@@ -2687,7 +2682,6 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GCN-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-NEXT:    s_mov_b32 s6, 0xfffe7960
-; GCN-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
@@ -2700,31 +2694,30 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GCN-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
 ; GCN-NEXT:    v_mul_hi_u32 v5, v6, v2
 ; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
-; GCN-NEXT:    v_add_u32_e32 v10, vcc, v5, v3
+; GCN-NEXT:    v_add_u32_e32 v9, vcc, v5, v3
 ; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v4, vcc
+; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0
-; GCN-NEXT:    v_add_u32_e32 v2, vcc, v10, v2
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v11, v3, vcc
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
+; GCN-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
 ; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
 ; GCN-NEXT:    v_mul_lo_u32 v4, v7, s6
-; GCN-NEXT:    s_mov_b32 s6, 0x186a0
 ; GCN-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
 ; GCN-NEXT:    v_add_u32_e32 v5, vcc, v3, v4
 ; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
 ; GCN-NEXT:    v_mul_hi_u32 v8, v6, v2
 ; GCN-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
 ; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
-; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0
 ; GCN-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v9, v3, vcc
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_add_u32_e32 v4, vcc, v6, v2
@@ -2737,16 +2730,17 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0
 ; GCN-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v3, vcc
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_add_u32_e32 v4, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; GCN-NEXT:    v_mul_lo_u32 v6, v5, s6
-; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s6, 0
+; GCN-NEXT:    s_mov_b32 s4, 0x186a0
+; GCN-NEXT:    v_mul_lo_u32 v6, v5, s4
+; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0
 ; GCN-NEXT:    s_mov_b32 s4, 0x1869f
 ; GCN-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
 ; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
+; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, 0x186a0, v0
 ; GCN-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
@@ -2773,15 +2767,14 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1030-NEXT:    s_mov_b32 s4, 0x346d900
-; GFX1030-NEXT:    s_mov_b32 s5, 0xfffe7960
 ; GFX1030-NEXT:    s_add_u32 s4, 0x4237, s4
-; GFX1030-NEXT:    s_addc_u32 s6, 0, 0
+; GFX1030-NEXT:    s_addc_u32 s5, 0, 0
 ; GFX1030-NEXT:    v_add_co_u32 v2, s4, 0xa9000000, s4
 ; GFX1030-NEXT:    s_cmpk_lg_u32 s4, 0x0
-; GFX1030-NEXT:    s_addc_u32 s4, s6, 0xa7c5
-; GFX1030-NEXT:    v_mul_hi_u32 v3, v2, s5
-; GFX1030-NEXT:    v_mul_lo_u32 v4, v2, s5
-; GFX1030-NEXT:    s_mul_i32 s5, s4, s5
+; GFX1030-NEXT:    s_addc_u32 s4, s5, 0xa7c5
+; GFX1030-NEXT:    v_mul_hi_u32 v3, 0xfffe7960, v2
+; GFX1030-NEXT:    v_mul_lo_u32 v4, 0xfffe7960, v2
+; GFX1030-NEXT:    s_mul_i32 s5, s4, 0xfffe7960
 ; GFX1030-NEXT:    v_sub_nc_u32_e32 v3, v3, v2
 ; GFX1030-NEXT:    v_mul_hi_u32 v5, v2, v4
 ; GFX1030-NEXT:    v_mul_hi_u32 v8, s4, v4
@@ -2804,7 +2797,6 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], s4, v1, v5, 0
 ; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], s4, v0, v6, 0
 ; GFX1030-NEXT:    v_mad_u64_u32 v[6:7], s4, v1, v6, 0
-; GFX1030-NEXT:    s_mov_b32 s4, 0x186a0
 ; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -2812,20 +2804,19 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo
 ; GFX1030-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v6
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], s5, v4, s4, 0
-; GFX1030-NEXT:    v_mul_lo_u32 v6, v5, s4
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], s4, 0x186a0, v4, 0
+; GFX1030-NEXT:    v_mul_lo_u32 v6, 0x186a0, v5
 ; GFX1030-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
 ; GFX1030-NEXT:    v_add_nc_u32_e32 v3, v3, v6
 ; GFX1030-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX1030-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s4
-; GFX1030-NEXT:    s_mov_b32 s4, 0x1869f
+; GFX1030-NEXT:    v_subrev_co_u32 v2, vcc_lo, 0x186a0, v0
 ; GFX1030-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX1030-NEXT:    v_cmp_lt_u32_e32 vcc_lo, s4, v2
+; GFX1030-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v2
+; GFX1030-NEXT:    v_cmp_eq_u32_e64 s4, 0, v1
 ; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
 ; GFX1030-NEXT:    v_add_co_u32 v6, vcc_lo, v4, 2
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
-; GFX1030-NEXT:    v_cmp_lt_u32_e32 vcc_lo, s4, v0
-; GFX1030-NEXT:    v_cmp_eq_u32_e64 s4, 0, v1
+; GFX1030-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v0
 ; GFX1030-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1030-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 20fa20a3d0735..3ed36b82ced7c 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -16,73 +16,72 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    s_subb_u32 s5, 0, s9
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v1, v1
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
-; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GCN-NEXT:    v_mul_lo_u32 v7, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v5, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GCN-NEXT:    v_mul_lo_u32 v7, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v9, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v6, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s2, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s2, v2
-; GCN-NEXT:    v_mul_hi_u32 v6, s3, v2
-; GCN-NEXT:    v_mul_lo_u32 v2, s3, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s5, v0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_mul_lo_u32 v3, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GCN-NEXT:    v_mul_hi_u32 v4, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v5, s3, v1
+; GCN-NEXT:    v_mul_lo_u32 v1, s3, v1
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v2, s8, v1
 ; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s9, v0
@@ -159,12 +158,11 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s12
-; GCN-IR-NEXT:    s_add_u32 s16, s4, -1
-; GCN-IR-NEXT:    s_addc_u32 s17, s5, -1
+; GCN-IR-NEXT:    s_add_u32 s15, s4, -1
+; GCN-IR-NEXT:    s_addc_u32 s16, s5, -1
 ; GCN-IR-NEXT:    s_not_b64 s[2:3], s[10:11]
 ; GCN-IR-NEXT:    s_add_u32 s2, s2, s14
-; GCN-IR-NEXT:    s_mov_b32 s15, s11
-; GCN-IR-NEXT:    s_addc_u32 s3, s3, s11
+; GCN-IR-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:  .LBB0_3: ; %udiv-do-while
@@ -174,8 +172,8 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[6:7]
 ; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
-; GCN-IR-NEXT:    s_sub_u32 s6, s16, s12
-; GCN-IR-NEXT:    s_subb_u32 s6, s17, s13
+; GCN-IR-NEXT:    s_sub_u32 s6, s15, s12
+; GCN-IR-NEXT:    s_subb_u32 s6, s16, s13
 ; GCN-IR-NEXT:    s_ashr_i32 s10, s6, 31
 ; GCN-IR-NEXT:    s_mov_b32 s11, s10
 ; GCN-IR-NEXT:    s_and_b32 s6, s10, 1
@@ -184,9 +182,9 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_subb_u32 s13, s13, s11
 ; GCN-IR-NEXT:    s_add_u32 s2, s2, 1
 ; GCN-IR-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[2:3], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[2:3], 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[14:15]
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_3
 ; GCN-IR-NEXT:  .LBB0_4: ; %Flow6
 ; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[8:9], 1
@@ -219,7 +217,6 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_subb_u32_e32 v7, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GCN-NEXT:    v_rcp_f32_e32 v4, v4
-; GCN-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GCN-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
@@ -235,7 +232,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v10, v4, v8
 ; GCN-NEXT:    v_mul_hi_u32 v11, v4, v9
 ; GCN-NEXT:    v_mul_hi_u32 v12, v4, v8
-; GCN-NEXT:    v_mul_hi_u32 v14, v5, v8
+; GCN-NEXT:    v_mul_hi_u32 v13, v5, v8
 ; GCN-NEXT:    v_mul_lo_u32 v8, v5, v8
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
@@ -243,7 +240,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v14, v13, vcc
+; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v13, vcc
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
@@ -265,7 +262,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v7, v5, v7
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
+; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
@@ -281,7 +278,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v4, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v13, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v6, v2, v5
@@ -323,35 +320,33 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-LABEL: v_test_udiv_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v2
-; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
+; GCN-IR-NEXT:    v_add_i32_e64 v4, s[6:7], 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v3
 ; GCN-IR-NEXT:    v_min_u32_e32 v8, v4, v5
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
-; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
+; GCN-IR-NEXT:    v_add_i32_e64 v4, s[6:7], 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v10, v4, v5
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v8, v10
-; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[6:7], 0, 0, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT:    v_min_u32_e32 v9, v4, v5
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[6:7], v8, v9
+; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[6:7], 0, 0, s[6:7]
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[6:7], 63, v[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[6:7]
 ; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v9
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v4, v1, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v0, 0, s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v6
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v6
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v6
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
@@ -359,25 +354,25 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v2
-; GCN-IR-NEXT:    v_lshr_b64 v[12:13], v[0:1], v12
-; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v3, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v2
+; GCN-IR-NEXT:    v_lshr_b64 v[10:11], v[0:1], v10
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v3, vcc
 ; GCN-IR-NEXT:    v_not_b32_e32 v0, v8
-; GCN-IR-NEXT:    v_not_b32_e32 v1, v9
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; GCN-IR-NEXT:    v_not_b32_e32 v1, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
 ; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, v1, v11, vcc
+; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[12:13], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
 ; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v14, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v15, v11, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v12, v10
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v13, v11, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v8, v4
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v6
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
@@ -387,8 +382,8 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_and_b32_e32 v8, v8, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v10, v8
-; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v11, v9, s[4:5]
+; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v8
+; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v9, s[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, v7
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v8, v6
@@ -679,52 +674,50 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0xe
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xd
-; GCN-NEXT:    s_mov_b32 s6, 0xffff
-; GCN-NEXT:    s_mov_b32 s7, 0xff000000
-; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v2, s6
+; GCN-NEXT:    s_load_dword s6, s[0:1], 0xc
+; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v2, 0xffff
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s3, s2, s6
-; GCN-NEXT:    s_and_b32 s2, s4, s7
+; GCN-NEXT:    s_and_b32 s3, s2, 0xffff
+; GCN-NEXT:    s_and_b32 s2, s4, 0xff000000
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 24
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v0
-; GCN-NEXT:    s_load_dword s8, s[0:1], 0xb
-; GCN-NEXT:    s_load_dword s9, s[0:1], 0xc
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_lshr_b64 s[0:1], s[2:3], 24
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
+; GCN-NEXT:    s_and_b32 s8, s6, 0xffff
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v2
 ; GCN-NEXT:    v_rcp_f32_e32 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s6, s9, s6
-; GCN-NEXT:    s_and_b32 s8, s8, s7
+; GCN-NEXT:    s_and_b32 s9, s0, 0xff000000
+; GCN-NEXT:    s_lshr_b64 s[0:1], s[2:3], 24
 ; GCN-NEXT:    s_sub_u32 s0, 0, s0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    s_subb_u32 s1, 0, s1
-; GCN-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-NEXT:    v_mul_hi_u32 v3, s0, v1
-; GCN-NEXT:    v_mul_lo_u32 v4, s0, v2
+; GCN-NEXT:    v_mul_lo_u32 v3, s0, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, s0, v1
 ; GCN-NEXT:    v_mul_lo_u32 v5, s1, v1
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, s0, v1
+; GCN-NEXT:    v_mul_lo_u32 v6, s0, v1
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_mul_lo_u32 v5, v1, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, v1, v6
 ; GCN-NEXT:    v_mul_hi_u32 v7, v1, v3
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v3
+; GCN-NEXT:    v_mul_hi_u32 v8, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
-; GCN-NEXT:    v_mul_lo_u32 v7, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v4, v2, v4
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v8, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v7, v2, v6
+; GCN-NEXT:    v_mul_hi_u32 v6, v2, v6
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v5, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
@@ -736,32 +729,32 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_mul_lo_u32 v4, s0, v1
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GCN-NEXT:    v_mul_lo_u32 v7, v1, v3
-; GCN-NEXT:    v_mul_hi_u32 v9, v1, v4
-; GCN-NEXT:    v_mul_hi_u32 v10, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v8, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v6, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_mov_b32_e32 v3, s8
+; GCN-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_alignbit_b32 v3, s6, v3, 24
+; GCN-NEXT:    v_alignbit_b32 v3, s8, v3, 24
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v2
 ; GCN-NEXT:    v_mul_hi_u32 v1, v3, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, v3, v2
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v8, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -797,26 +790,24 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-LABEL: s_test_udiv24_i48:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xc
-; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dword s3, s[0:1], 0xb
+; GCN-IR-NEXT:    s_load_dword s4, s[0:1], 0xb
+; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_load_dword s6, s[0:1], 0xd
-; GCN-IR-NEXT:    s_load_dword s7, s[0:1], 0xe
-; GCN-IR-NEXT:    s_mov_b32 s8, 0xffff
+; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_and_b32 s1, s2, s8
-; GCN-IR-NEXT:    s_mov_b32 s2, 0xff000000
-; GCN-IR-NEXT:    s_and_b32 s0, s3, s2
-; GCN-IR-NEXT:    s_and_b32 s3, s7, s8
-; GCN-IR-NEXT:    s_and_b32 s2, s6, s2
-; GCN-IR-NEXT:    s_lshr_b64 s[8:9], s[0:1], 24
-; GCN-IR-NEXT:    s_lshr_b64 s[2:3], s[2:3], 24
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[2:3], 0
+; GCN-IR-NEXT:    s_and_b32 s3, s2, 0xffff
+; GCN-IR-NEXT:    s_and_b32 s2, s4, 0xff000000
+; GCN-IR-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-IR-NEXT:    s_and_b32 s4, s6, 0xff000000
+; GCN-IR-NEXT:    s_lshr_b64 s[8:9], s[2:3], 24
+; GCN-IR-NEXT:    s_lshr_b64 s[4:5], s[4:5], 24
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[8:9], 0
-; GCN-IR-NEXT:    s_mov_b64 s[0:1], 0
+; GCN-IR-NEXT:    s_mov_b64 s[2:3], 0
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[6:7], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
+; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s4
 ; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
+; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s5
 ; GCN-IR-NEXT:    s_min_u32 s10, s6, s7
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s8
 ; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
@@ -842,40 +833,39 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[8:9], s12
-; GCN-IR-NEXT:    s_add_u32 s16, s2, -1
-; GCN-IR-NEXT:    s_addc_u32 s17, s3, -1
-; GCN-IR-NEXT:    s_not_b64 s[0:1], s[10:11]
-; GCN-IR-NEXT:    s_add_u32 s8, s0, s14
-; GCN-IR-NEXT:    s_mov_b32 s15, s11
-; GCN-IR-NEXT:    s_addc_u32 s9, s1, s11
+; GCN-IR-NEXT:    s_add_u32 s15, s4, -1
+; GCN-IR-NEXT:    s_addc_u32 s16, s5, -1
+; GCN-IR-NEXT:    s_not_b64 s[2:3], s[10:11]
+; GCN-IR-NEXT:    s_add_u32 s8, s2, s14
+; GCN-IR-NEXT:    s_addc_u32 s9, s3, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT:    s_mov_b32 s1, 0
+; GCN-IR-NEXT:    s_mov_b32 s3, 0
 ; GCN-IR-NEXT:  .LBB7_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
-; GCN-IR-NEXT:    s_lshr_b32 s0, s7, 31
+; GCN-IR-NEXT:    s_lshr_b32 s2, s7, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[2:3]
 ; GCN-IR-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s0, s16, s12
-; GCN-IR-NEXT:    s_subb_u32 s0, s17, s13
-; GCN-IR-NEXT:    s_ashr_i32 s10, s0, 31
+; GCN-IR-NEXT:    s_sub_u32 s2, s15, s12
+; GCN-IR-NEXT:    s_subb_u32 s2, s16, s13
+; GCN-IR-NEXT:    s_ashr_i32 s10, s2, 31
 ; GCN-IR-NEXT:    s_mov_b32 s11, s10
-; GCN-IR-NEXT:    s_and_b32 s0, s10, 1
-; GCN-IR-NEXT:    s_and_b64 s[10:11], s[10:11], s[2:3]
+; GCN-IR-NEXT:    s_and_b32 s2, s10, 1
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[10:11], s[4:5]
 ; GCN-IR-NEXT:    s_sub_u32 s12, s12, s10
 ; GCN-IR-NEXT:    s_subb_u32 s13, s13, s11
 ; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
 ; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[8:9], 0
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[0:1]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[14:15]
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[8:9], 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_3
 ; GCN-IR-NEXT:  .LBB7_4: ; %Flow3
-; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-IR-NEXT:    s_lshl_b64 s[4:5], s[6:7], 1
+; GCN-IR-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-IR-NEXT:    s_branch .LBB7_6
 ; GCN-IR-NEXT:  .LBB7_5:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s9
@@ -883,10 +873,10 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[12:13]
 ; GCN-IR-NEXT:  .LBB7_6: ; %udiv-end
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
-; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
+; GCN-IR-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
+; GCN-IR-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = lshr i48 %x, 24
   %2 = lshr i48 %y, 24
@@ -908,58 +898,57 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    s_subb_u32 s5, 0, s3
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v1, v1
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
-; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v7, v2, v5
-; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GCN-NEXT:    v_mul_lo_u32 v7, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v9, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v6, vcc
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s5, v0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_mul_lo_u32 v3, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v2, v1, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, v0, 24
 ; GCN-NEXT:    v_mul_hi_u32 v1, v1, 24
@@ -1092,7 +1081,6 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
-; GCN-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
@@ -1105,18 +1093,18 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v9, v4, v2
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GCN-NEXT:    v_mul_lo_u32 v7, v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v8, v2, v9
+; GCN-NEXT:    v_mul_hi_u32 v7, v2, v9
+; GCN-NEXT:    v_mul_lo_u32 v8, v2, v6
 ; GCN-NEXT:    v_mul_hi_u32 v10, v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v12, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v11, v3, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v10, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v10, v3, v9
 ; GCN-NEXT:    v_mul_hi_u32 v9, v3, v9
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v8, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v12, v11, vcc
+; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
@@ -1138,7 +1126,7 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v6, v11, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -1189,22 +1177,21 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[4:5]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, v7
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB9_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
+; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v4
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
@@ -1215,10 +1202,10 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v8
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v7
 ; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT:    v_subb_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
@@ -1251,12 +1238,12 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:  .LBB9_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v5, v1
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v4, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v1
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v4, v0
 ; GCN-IR-NEXT:  .LBB9_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, v2
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 32768, %x
   ret i64 %result
@@ -1355,8 +1342,8 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x41c00000
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_movk_i32 s4, 0xffe8
-; GCN-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
@@ -1365,25 +1352,24 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
-; GCN-NEXT:    v_mul_lo_u32 v3, v1, s4
-; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
 ; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, s4
+; GCN-NEXT:    v_mul_lo_u32 v3, v0, s4
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v5, v0, v4
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v0, v3
+; GCN-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -1397,7 +1383,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
@@ -1405,7 +1391,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v7, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -1421,7 +1407,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, v1, 24
@@ -1536,7 +1522,6 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_madak_f32 v2, 0, v2, 0x41c00000
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-NEXT:    s_movk_i32 s4, 0xffe8
-; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
@@ -1550,16 +1535,16 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, v2, v5
 ; GCN-NEXT:    v_mul_lo_u32 v6, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v10, v3, v4
+; GCN-NEXT:    v_mul_hi_u32 v8, v2, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v3, v4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
-; GCN-NEXT:    v_mul_lo_u32 v9, v3, v5
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v8, v3, v5
 ; GCN-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v8, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -1571,16 +1556,16 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_mul_lo_u32 v5, v2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v10, v3, v4
+; GCN-NEXT:    v_mul_hi_u32 v8, v2, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v3, v4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
-; GCN-NEXT:    v_mul_lo_u32 v9, v3, v6
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v8, v3, v6
 ; GCN-NEXT:    v_mul_hi_u32 v6, v3, v6
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v8, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -1596,7 +1581,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v8, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, 24

diff  --git a/llvm/test/CodeGen/AMDGPU/udivrem24.ll b/llvm/test/CodeGen/AMDGPU/udivrem24.ll
index 17cf533d131f0..6b629ca9582f0 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem24.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem24.ll
@@ -346,9 +346,8 @@ define amdgpu_kernel void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addr
 }
 
 ; FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32:
-; SI-DAG: v_rcp_iflag_f32
-; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
-; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
+; SI: v_rcp_iflag_f32
+; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff,
 
 ; EG: RECIP_IEEE
 define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -365,9 +364,8 @@ define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 a
 }
 
 ; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32:
-; SI-DAG: v_rcp_iflag_f32
-; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
-; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
+; SI: v_rcp_iflag_f32
+; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff,
 
 ; EG: RECIP_IEEE
 define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {

diff  --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index ef6e67f64a26e..35079f42ac78e 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -17,72 +17,71 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    s_mov_b32 s4, s8
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_mov_b32 s5, s9
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v1, v1
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v3, s0, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s0, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s1, v0
-; GCN-NEXT:    v_mul_lo_u32 v5, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GCN-NEXT:    v_mul_lo_u32 v7, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v5, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s0, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s1, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GCN-NEXT:    v_mul_lo_u32 v7, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v9, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v6, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s10, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s10, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s10, v2
-; GCN-NEXT:    v_mul_hi_u32 v6, s11, v2
-; GCN-NEXT:    v_mul_lo_u32 v2, s11, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_mul_lo_u32 v3, s0, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
+; GCN-NEXT:    v_mul_hi_u32 v4, s10, v1
+; GCN-NEXT:    v_mul_hi_u32 v5, s11, v1
+; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s11, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v1, s12, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
@@ -161,9 +160,8 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_add_u32 s16, s4, -1
 ; GCN-IR-NEXT:    s_addc_u32 s17, s5, -1
 ; GCN-IR-NEXT:    s_not_b64 s[6:7], s[10:11]
-; GCN-IR-NEXT:    s_mov_b32 s15, s11
 ; GCN-IR-NEXT:    s_add_u32 s10, s6, s14
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, s11
+; GCN-IR-NEXT:    s_addc_u32 s11, s7, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:  .LBB0_3: ; %udiv-do-while
@@ -229,7 +227,6 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_subb_u32_e32 v7, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GCN-NEXT:    v_rcp_f32_e32 v4, v4
-; GCN-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GCN-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
@@ -245,7 +242,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v10, v4, v8
 ; GCN-NEXT:    v_mul_hi_u32 v11, v4, v9
 ; GCN-NEXT:    v_mul_hi_u32 v12, v4, v8
-; GCN-NEXT:    v_mul_hi_u32 v14, v5, v8
+; GCN-NEXT:    v_mul_hi_u32 v13, v5, v8
 ; GCN-NEXT:    v_mul_lo_u32 v8, v5, v8
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
@@ -253,7 +250,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v14, v13, vcc
+; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v13, vcc
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
@@ -275,7 +272,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v7, v5, v7
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
+; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
@@ -291,7 +288,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v4, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v13, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
@@ -332,35 +329,33 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-LABEL: v_test_urem_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v2
-; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
+; GCN-IR-NEXT:    v_add_i32_e64 v4, s[6:7], 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v3
 ; GCN-IR-NEXT:    v_min_u32_e32 v8, v4, v5
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
-; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
+; GCN-IR-NEXT:    v_add_i32_e64 v4, s[6:7], 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v10, v4, v5
-; GCN-IR-NEXT:    v_sub_i32_e32 v5, vcc, v8, v10
-; GCN-IR-NEXT:    v_subb_u32_e64 v6, s[6:7], 0, 0, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[5:6]
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT:    v_min_u32_e32 v9, v4, v5
+; GCN-IR-NEXT:    v_sub_i32_e64 v5, s[6:7], v8, v9
+; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; GCN-IR-NEXT:    v_subb_u32_e64 v6, s[6:7], 0, 0, s[6:7]
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[6:7], 63, v[5:6]
+; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[5:6]
 ; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v9
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v7, v1, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v4, v0, 0, s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v6, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v6, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v5
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
@@ -371,36 +366,36 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v3, vcc
 ; GCN-IR-NEXT:    v_not_b32_e32 v7, v8
-; GCN-IR-NEXT:    v_not_b32_e32 v6, v9
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v7, v10
-; GCN-IR-NEXT:    v_lshr_b64 v[12:13], v[0:1], v12
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, v6, v11, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[10:11], v[0:1], v10
+; GCN-IR-NEXT:    v_not_b32_e32 v6, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v7, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v6, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v14, v12
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v15, v13, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v10, v4
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v6
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v14, v10
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v15, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v11, v5
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v10
-; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v3
-; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v13, v5
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT:    v_and_b32_e32 v13, v12, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v12, v12, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v12, v10
-; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v13, v11, s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v7
+; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v12
+; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v6
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
 ; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
@@ -731,58 +726,57 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    s_mov_b32 s8, s4
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_mov_b32 s9, s5
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v1, v1
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v3, s0, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s0, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s1, v0
-; GCN-NEXT:    v_mul_lo_u32 v5, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v7, v2, v5
-; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s0, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s1, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GCN-NEXT:    v_mul_lo_u32 v7, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v9, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v6, vcc
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_mul_lo_u32 v3, s0, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v2, v1, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, v0, 24
 ; GCN-NEXT:    v_mul_hi_u32 v1, v1, 24
@@ -915,8 +909,8 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x41c00000
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_movk_i32 s4, 0xffe8
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
@@ -925,63 +919,62 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s4
-; GCN-NEXT:    v_mul_lo_u32 v5, v1, s4
-; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
 ; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v3
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v1, v3
-; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
-; GCN-NEXT:    v_mul_lo_u32 v7, v1, v4
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v2, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s4
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v1, s4
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, s4
+; GCN-NEXT:    v_mul_lo_u32 v3, v0, s4
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v0, v3
+; GCN-NEXT:    v_mul_lo_u32 v4, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, s4
+; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
 ; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v3
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v1, v3
-; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
-; GCN-NEXT:    v_mul_lo_u32 v7, v1, v5
-; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v5, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v2, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s2, v1
-; GCN-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s2, v1
-; GCN-NEXT:    v_mul_hi_u32 v6, s3, v1
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GCN-NEXT:    v_mul_hi_u32 v4, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v5, s3, v1
 ; GCN-NEXT:    v_mul_lo_u32 v1, s3, v1
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v6, v2, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, 24
@@ -1111,7 +1104,6 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
-; GCN-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
@@ -1124,18 +1116,18 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v9, v4, v2
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GCN-NEXT:    v_mul_lo_u32 v7, v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v8, v2, v9
+; GCN-NEXT:    v_mul_hi_u32 v7, v2, v9
+; GCN-NEXT:    v_mul_lo_u32 v8, v2, v6
 ; GCN-NEXT:    v_mul_hi_u32 v10, v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v12, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v11, v3, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v10, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v10, v3, v9
 ; GCN-NEXT:    v_mul_hi_u32 v9, v3, v9
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v8, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v12, v11, vcc
+; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
@@ -1157,7 +1149,7 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v6, v11, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -1203,26 +1195,25 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
 ; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffd0, v6
-; GCN-IR-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffd0, v6
+; GCN-IR-NEXT:    v_addc_u32_e64 v4, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[2:3]
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[3:4]
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[2:3]
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s[4:5]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[3:4]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, v7
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB8_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v2
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v3
+; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
+; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v3
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
@@ -1233,10 +1224,10 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v8
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v7
 ; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT:    v_subb_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
@@ -1268,15 +1259,15 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB8_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v4, v2
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[2:3], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v7
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v4, v6
 ; GCN-IR-NEXT:  .LBB8_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mul_lo_u32 v2, v0, v5
-; GCN-IR-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v4
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v4
+; GCN-IR-NEXT:    v_mul_lo_u32 v2, v0, v2
+; GCN-IR-NEXT:    v_mul_hi_u32 v3, v0, v5
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v5
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v5
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index 3c7b49d0f501d..17b357de1e783 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -31,9 +31,8 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX10-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)

diff  --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index 940dc967edc07..129b93e5b0018 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -173,13 +173,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(half addrspace(1)* %in0, half addrspa
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_waitcnt_depctr 0xffe3
-; GISEL-NEXT:    s_movk_i32 s0, 0x7fff
-; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
-; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
-; GISEL-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x7fff
+; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v1
+; GISEL-NEXT:    v_add_f16_e32 v2, 2.0, v2
+; GISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GISEL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
 ; GISEL-NEXT:    ;;#ASMSTART
 ; GISEL-NEXT:    ; use v0
 ; GISEL-NEXT:    ;;#ASMEND
@@ -232,7 +231,7 @@ define amdgpu_kernel void @v_pack_b32.fneg(half addrspace(1)* %in0, half addrspa
 ; GISEL-NEXT:    s_mov_b32 s0, 0x8000
 ; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
 ; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT:    v_add_f16_e64 v0, s0, -v0
+; GISEL-NEXT:    v_add_f16_e64 v0, 0x8000, -v0
 ; GISEL-NEXT:    v_add_f16_sdwa v1, s0, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GISEL-NEXT:    ;;#ASMSTART

diff  --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 8d4d929e029d7..ee607351d5ba1 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -967,13 +967,12 @@ define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v4, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, v0, v2
-; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
-; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -1384,7 +1383,7 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
-; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v5, 16, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index d85733d85363c..a5e055290d37e 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -190,8 +190,8 @@ define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, [8 x i
 ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039
 ; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]]
 
-; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
-; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
+; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039
+; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xf237b
 define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, i64 %b) {
   %or = xor i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index 01820ea1854bb..a0e46e61c6c18 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -51,9 +51,8 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a,
 ; GCN: s_load_dword [[A:s[0-9]+]]
 ; GCN: s_load_dword [[B:s[0-9]+]]
 
-; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], [[MASK]]
-; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], [[MASK]]
+; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}}
+; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}}
 ; GCN: s_cmp_eq_u32 [[MASK_A]], [[B]]
 ; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]


        


More information about the llvm-commits mailing list