[llvm] 0776f6e - [LSV] Vectorize loads of vectors by turning it into a larger vector

Benjamin Kramer via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 26 02:38:55 PST 2022


Author: Benjamin Kramer
Date: 2022-01-26T11:38:41+01:00
New Revision: 0776f6e04d8c3823c31b8fa6fa7d1376a7009af1

URL: https://github.com/llvm/llvm-project/commit/0776f6e04d8c3823c31b8fa6fa7d1376a7009af1
DIFF: https://github.com/llvm/llvm-project/commit/0776f6e04d8c3823c31b8fa6fa7d1376a7009af1.diff

LOG: [LSV] Vectorize loads of vectors by turning it into a larger vector

Use shufflevector to do the subvector extracts. This allows a lot more
load merging on AMDGPU and also on NVPTX when <2 x half> is involved.

Differential Revision: https://reviews.llvm.org/D117219

Added: 
    llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
    llvm/test/CodeGen/AMDGPU/fmax_legacy.ll
    llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
    llvm/test/CodeGen/AMDGPU/fshl.ll
    llvm/test/CodeGen/AMDGPU/fshr.ll
    llvm/test/CodeGen/AMDGPU/half.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
    llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
    llvm/test/CodeGen/AMDGPU/memory_clause.ll
    llvm/test/CodeGen/AMDGPU/min.ll
    llvm/test/CodeGen/AMDGPU/mul_int24.ll
    llvm/test/CodeGen/AMDGPU/sdiv.ll
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/select-vectors.ll
    llvm/test/CodeGen/AMDGPU/shift-i128.ll
    llvm/test/CodeGen/AMDGPU/shl.ll
    llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
    llvm/test/CodeGen/AMDGPU/sra.ll
    llvm/test/CodeGen/AMDGPU/srl.ll
    llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
    llvm/test/CodeGen/AMDGPU/udiv.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index d2e0d1d474b0e..97c2acb7d4c76 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -854,13 +854,6 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
           (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
         continue;
 
-      // Make sure all the users of a vector are constant-index extracts.
-      if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
-            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
-            return EEI && isa<ConstantInt>(EEI->getOperand(1));
-          }))
-        continue;
-
       // Save the load locations.
       const ChainID ID = getChainID(Ptr);
       LoadRefs[ID].push_back(LI);
@@ -901,12 +894,6 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
           (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
         continue;
 
-      if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
-            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
-            return EEI && isa<ConstantInt>(EEI->getOperand(1));
-          }))
-        continue;
-
       // Save store location.
       const ChainID ID = getChainID(Ptr);
       StoreRefs[ID].push_back(SI);
@@ -1290,52 +1277,32 @@ bool Vectorizer::vectorizeLoadChain(
       Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
   propagateMetadata(LI, Chain);
 
-  if (VecLoadTy) {
-    SmallVector<Instruction *, 16> InstrsToErase;
-
-    unsigned VecWidth = VecLoadTy->getNumElements();
-    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
-      for (auto Use : Chain[I]->users()) {
-        // All users of vector loads are ExtractElement instructions with
-        // constant indices, otherwise we would have bailed before now.
-        Instruction *UI = cast<Instruction>(Use);
-        unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
-        unsigned NewIdx = Idx + I * VecWidth;
-        Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
-                                                UI->getName());
-        if (V->getType() != UI->getType())
-          V = Builder.CreateBitCast(V, UI->getType());
-
-        // Replace the old instruction.
-        UI->replaceAllUsesWith(V);
-        InstrsToErase.push_back(UI);
-      }
+  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+    Value *CV = Chain[I];
+    Value *V;
+    if (VecLoadTy) {
+      // Extract a subvector using shufflevector.
+      unsigned VecWidth = VecLoadTy->getNumElements();
+      auto Mask =
+          llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth));
+      V = Builder.CreateShuffleVector(LI, Mask, CV->getName());
+    } else {
+      V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
     }
 
-    // Bitcast might not be an Instruction, if the value being loaded is a
-    // constant.  In that case, no need to reorder anything.
-    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
-      reorder(BitcastInst);
-
-    for (auto I : InstrsToErase)
-      I->eraseFromParent();
-  } else {
-    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
-      Value *CV = Chain[I];
-      Value *V =
-          Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
-      if (V->getType() != CV->getType()) {
-        V = Builder.CreateBitOrPointerCast(V, CV->getType());
-      }
-
-      // Replace the old instruction.
-      CV->replaceAllUsesWith(V);
+    if (V->getType() != CV->getType()) {
+      V = Builder.CreateBitOrPointerCast(V, CV->getType());
     }
 
-    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
-      reorder(BitcastInst);
+    // Replace the old instruction.
+    CV->replaceAllUsesWith(V);
   }
 
+  // Bitcast might not be an Instruction, if the value being loaded is a
+  // constant. In that case, no need to reorder anything.
+  if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
+    reorder(BitcastInst);
+
   eraseInstructions(Chain);
 
   ++NumVectorInstructions;

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 62b01dc0ee3e9..25eae693c1634 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -667,69 +667,68 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 define amdgpu_kernel void @sdivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32> addrspace(1)* %out1, <2 x i32> %x, <2 x i32> %y) {
 ; GFX8-LABEL: sdivrem_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x18
-; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_ashr_i32 s8, s0, 31
-; GFX8-NEXT:    s_add_i32 s0, s0, s8
-; GFX8-NEXT:    s_xor_b32 s9, s0, s8
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX8-NEXT:    s_ashr_i32 s11, s1, 31
-; GFX8-NEXT:    s_add_i32 s0, s1, s11
-; GFX8-NEXT:    s_sub_i32 s1, 0, s9
+; GFX8-NEXT:    s_ashr_i32 s2, s10, 31
+; GFX8-NEXT:    s_add_i32 s0, s10, s2
+; GFX8-NEXT:    s_xor_b32 s3, s0, s2
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX8-NEXT:    s_sub_i32 s1, 0, s3
+; GFX8-NEXT:    s_ashr_i32 s12, s11, 31
+; GFX8-NEXT:    s_add_i32 s0, s11, s12
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_xor_b32 s12, s0, s11
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s12
-; GFX8-NEXT:    s_ashr_i32 s10, s2, 31
+; GFX8-NEXT:    s_xor_b32 s11, s0, s12
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s11
+; GFX8-NEXT:    s_ashr_i32 s10, s8, 31
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_add_i32 s0, s2, s10
+; GFX8-NEXT:    s_add_i32 s0, s8, s10
 ; GFX8-NEXT:    s_xor_b32 s0, s0, s10
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
-; GFX8-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX8-NEXT:    s_sub_i32 s8, 0, s11
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s9
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s3
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s9, v2
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s9, v2
-; GFX8-NEXT:    s_sub_i32 s0, 0, s12
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v3, s0, v1
-; GFX8-NEXT:    s_add_i32 s1, s3, s2
-; GFX8-NEXT:    s_xor_b32 s1, s1, s2
-; GFX8-NEXT:    s_xor_b32 s0, s10, s8
+; GFX8-NEXT:    v_mul_lo_u32 v3, s8, v1
+; GFX8-NEXT:    s_xor_b32 s0, s10, s2
+; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
+; GFX8-NEXT:    s_add_i32 s1, s9, s2
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX8-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s10, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
+; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s10, v2
-; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s12
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s11
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v3
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s12, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s12, v3
-; GFX8-NEXT:    s_xor_b32 s0, s2, s11
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
+; GFX8-NEXT:    s_xor_b32 s0, s2, s12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
@@ -745,74 +744,73 @@ define amdgpu_kernel void @sdivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ;
 ; GFX9-LABEL: sdivrem_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x18
-; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x10
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s10, s6, 31
-; GFX9-NEXT:    s_add_i32 s0, s6, s10
-; GFX9-NEXT:    s_xor_b32 s6, s0, s10
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT:    s_ashr_i32 s6, s10, 31
+; GFX9-NEXT:    s_add_i32 s0, s10, s6
+; GFX9-NEXT:    s_xor_b32 s7, s0, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
-; GFX9-NEXT:    s_add_i32 s7, s7, s5
+; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
+; GFX9-NEXT:    s_add_i32 s5, s11, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s7, s7, s5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT:    s_sub_i32 s11, 0, s6
+; GFX9-NEXT:    s_xor_b32 s5, s5, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
+; GFX9-NEXT:    s_sub_i32 s11, 0, s7
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_ashr_i32 s4, s8, 31
-; GFX9-NEXT:    s_add_i32 s8, s8, s4
+; GFX9-NEXT:    s_ashr_i32 s10, s8, 31
+; GFX9-NEXT:    s_add_i32 s8, s8, s10
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s11, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_xor_b32 s8, s8, s4
+; GFX9-NEXT:    s_xor_b32 s8, s8, s10
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    s_sub_i32 s12, 0, s7
+; GFX9-NEXT:    s_sub_i32 s11, 0, s5
+; GFX9-NEXT:    v_mul_lo_u32 v3, s11, v1
 ; GFX9-NEXT:    s_ashr_i32 s11, s9, 31
-; GFX9-NEXT:    s_add_i32 s9, s9, s11
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v1
+; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
+; GFX9-NEXT:    s_add_i32 s9, s9, s11
 ; GFX9-NEXT:    s_xor_b32 s9, s9, s11
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s6
-; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s7
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
+; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s7
+; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s7, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s5
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v2
-; GFX9-NEXT:    s_xor_b32 s6, s4, s10
+; GFX9-NEXT:    s_xor_b32 s6, s10, s6
+; GFX9-NEXT:    s_xor_b32 s4, s11, s4
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s9, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
-; GFX9-NEXT:    s_xor_b32 s4, s11, s5
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v3
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
@@ -821,34 +819,32 @@ define amdgpu_kernel void @sdivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ;
 ; GFX10-LABEL: sdivrem_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x18
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s2, s0, 31
-; GFX10-NEXT:    s_ashr_i32 s3, s1, 31
-; GFX10-NEXT:    s_add_i32 s0, s0, s2
-; GFX10-NEXT:    s_add_i32 s1, s1, s3
-; GFX10-NEXT:    s_xor_b32 s8, s0, s2
-; GFX10-NEXT:    s_xor_b32 s9, s1, s3
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
-; GFX10-NEXT:    s_sub_i32 s6, 0, s8
-; GFX10-NEXT:    s_sub_i32 s7, 0, s9
+; GFX10-NEXT:    s_ashr_i32 s8, s2, 31
+; GFX10-NEXT:    s_ashr_i32 s9, s3, 31
+; GFX10-NEXT:    s_add_i32 s2, s2, s8
+; GFX10-NEXT:    s_add_i32 s3, s3, s9
+; GFX10-NEXT:    s_xor_b32 s2, s2, s8
+; GFX10-NEXT:    s_xor_b32 s3, s3, s9
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX10-NEXT:    s_sub_i32 s6, 0, s2
+; GFX10-NEXT:    s_sub_i32 s7, 0, s3
+; GFX10-NEXT:    s_ashr_i32 s10, s0, 31
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX10-NEXT:    s_ashr_i32 s11, s1, 31
+; GFX10-NEXT:    s_add_i32 s0, s0, s10
+; GFX10-NEXT:    s_add_i32 s1, s1, s11
+; GFX10-NEXT:    s_xor_b32 s0, s0, s10
+; GFX10-NEXT:    s_xor_b32 s1, s1, s11
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s10, s0, 31
-; GFX10-NEXT:    s_ashr_i32 s11, s1, 31
-; GFX10-NEXT:    s_add_i32 s0, s0, s10
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s6, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s7, v1
-; GFX10-NEXT:    s_add_i32 s1, s1, s11
-; GFX10-NEXT:    s_xor_b32 s0, s0, s10
-; GFX10-NEXT:    s_xor_b32 s1, s1, s11
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
@@ -856,32 +852,32 @@ define amdgpu_kernel void @sdivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s8
-; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s9
+; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s2
+; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s0, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s1, v3
-; GFX10-NEXT:    s_xor_b32 s1, s10, s2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s8, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s9, v3
+; GFX10-NEXT:    s_xor_b32 s1, s10, s8
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s8, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s9, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    s_xor_b32 s0, s11, s3
+; GFX10-NEXT:    s_xor_b32 s0, s11, s9
 ; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v2, s10, v2
@@ -905,142 +901,141 @@ define amdgpu_kernel void @sdivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %x, <4 x i32> %y) {
 ; GFX8-LABEL: sdivrem_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
-; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f7ffffe
+; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x4f7ffffe
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_ashr_i32 s12, s0, 31
-; GFX8-NEXT:    s_add_i32 s0, s0, s12
-; GFX8-NEXT:    s_xor_b32 s13, s0, s12
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s13
-; GFX8-NEXT:    s_ashr_i32 s15, s1, 31
-; GFX8-NEXT:    s_add_i32 s0, s1, s15
-; GFX8-NEXT:    s_sub_i32 s1, 0, s13
+; GFX8-NEXT:    s_ashr_i32 s2, s12, 31
+; GFX8-NEXT:    s_add_i32 s0, s12, s2
+; GFX8-NEXT:    s_xor_b32 s3, s0, s2
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX8-NEXT:    s_sub_i32 s1, 0, s3
+; GFX8-NEXT:    s_ashr_i32 s16, s13, 31
+; GFX8-NEXT:    s_add_i32 s0, s13, s16
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_xor_b32 s16, s0, s15
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s16
-; GFX8-NEXT:    s_ashr_i32 s14, s8, 31
+; GFX8-NEXT:    s_xor_b32 s13, s0, s16
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, s13
+; GFX8-NEXT:    s_ashr_i32 s12, s8, 31
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_add_i32 s0, s8, s14
-; GFX8-NEXT:    s_xor_b32 s0, s0, s14
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX8-NEXT:    s_add_i32 s0, s8, s12
+; GFX8-NEXT:    s_xor_b32 s0, s0, s12
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
-; GFX8-NEXT:    s_ashr_i32 s8, s9, 31
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, v2, v3
+; GFX8-NEXT:    v_mul_f32_e32 v1, v3, v2
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s13
+; GFX8-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s0, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s13, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s13, v2
-; GFX8-NEXT:    s_sub_i32 s0, 0, s16
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
+; GFX8-NEXT:    s_sub_i32 s0, 0, s13
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v4, s0, v1
-; GFX8-NEXT:    s_add_i32 s1, s9, s8
-; GFX8-NEXT:    s_xor_b32 s1, s1, s8
-; GFX8-NEXT:    s_xor_b32 s0, s14, s12
+; GFX8-NEXT:    s_xor_b32 s0, s12, s2
+; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
+; GFX8-NEXT:    s_add_i32 s1, s9, s2
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX8-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX8-NEXT:    v_xor_b32_e32 v2, s14, v2
-; GFX8-NEXT:    s_ashr_i32 s9, s2, 31
+; GFX8-NEXT:    v_xor_b32_e32 v3, s12, v3
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
+; GFX8-NEXT:    s_ashr_i32 s3, s14, 31
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s14, v2
-; GFX8-NEXT:    v_mul_lo_u32 v5, v1, s16
-; GFX8-NEXT:    s_add_i32 s0, s2, s9
-; GFX8-NEXT:    s_xor_b32 s2, s0, s9
-; GFX8-NEXT:    s_ashr_i32 s12, s10, 31
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s1, v5
+; GFX8-NEXT:    v_mul_lo_u32 v5, v1, s13
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s12, v3
+; GFX8-NEXT:    s_add_i32 s0, s14, s3
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v5
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s16, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
+; GFX8-NEXT:    s_xor_b32 s8, s0, s3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s2
-; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s8
+; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s13, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s16, v2
-; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
+; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v2
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s16, v2
-; GFX8-NEXT:    s_sub_i32 s0, 0, s2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s13, v3
+; GFX8-NEXT:    s_sub_i32 s0, 0, s8
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v6, s0, v5
-; GFX8-NEXT:    s_add_i32 s1, s10, s12
-; GFX8-NEXT:    s_xor_b32 s1, s1, s12
-; GFX8-NEXT:    s_xor_b32 s0, s8, s15
+; GFX8-NEXT:    s_ashr_i32 s9, s10, 31
+; GFX8-NEXT:    s_add_i32 s1, s10, s9
+; GFX8-NEXT:    s_xor_b32 s1, s1, s9
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GFX8-NEXT:    v_xor_b32_e32 v2, s8, v2
+; GFX8-NEXT:    s_xor_b32 s0, s2, s16
+; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v6, s1, v5
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s8, v2
-; GFX8-NEXT:    s_ashr_i32 s8, s3, 31
-; GFX8-NEXT:    v_mul_lo_u32 v7, v6, s2
-; GFX8-NEXT:    s_add_i32 s0, s3, s8
-; GFX8-NEXT:    s_xor_b32 s3, s0, s8
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s1, v7
+; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s2, v3
+; GFX8-NEXT:    s_ashr_i32 s2, s15, 31
+; GFX8-NEXT:    v_mul_lo_u32 v7, v6, s8
+; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
+; GFX8-NEXT:    s_add_i32 s0, s15, s2
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v7
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 1, v6
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX8-NEXT:    s_xor_b32 s10, s0, s2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v7, s3
-; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s2, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v7, s10
+; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s8, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v6
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
-; GFX8-NEXT:    v_mul_f32_e32 v3, v7, v3
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX8-NEXT:    v_subrev_u32_e64 v7, s[0:1], s2, v2
-; GFX8-NEXT:    s_sub_i32 s0, 0, s3
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v2, v7, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v3
-; GFX8-NEXT:    s_ashr_i32 s2, s11, 31
-; GFX8-NEXT:    s_add_i32 s1, s11, s2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX8-NEXT:    v_mul_f32_e32 v2, v7, v2
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX8-NEXT:    v_subrev_u32_e64 v7, s[0:1], s8, v3
+; GFX8-NEXT:    s_sub_i32 s0, 0, s10
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v7, s0, v2
+; GFX8-NEXT:    s_xor_b32 s0, s9, s3
+; GFX8-NEXT:    s_ashr_i32 s3, s11, 31
+; GFX8-NEXT:    s_add_i32 s1, s11, s3
+; GFX8-NEXT:    v_mul_hi_u32 v7, v2, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
-; GFX8-NEXT:    v_mul_hi_u32 v2, v3, v2
-; GFX8-NEXT:    s_xor_b32 s1, s1, s2
-; GFX8-NEXT:    s_xor_b32 s0, s12, s9
+; GFX8-NEXT:    s_xor_b32 s1, s1, s3
 ; GFX8-NEXT:    v_xor_b32_e32 v6, s0, v6
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT:    v_mul_hi_u32 v3, s1, v2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
+; GFX8-NEXT:    v_mul_hi_u32 v7, s1, v2
+; GFX8-NEXT:    v_xor_b32_e32 v3, s9, v3
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v6
-; GFX8-NEXT:    v_xor_b32_e32 v6, s12, v7
-; GFX8-NEXT:    v_mul_lo_u32 v7, v3, s3
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s12, v6
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v3
-; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s1, v7
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s3, v7
+; GFX8-NEXT:    v_mul_lo_u32 v8, v7, s10
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s9, v3
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v8
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v7
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v3
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v7
+; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s10, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s3, v7
-; GFX8-NEXT:    s_xor_b32 s0, s2, s8
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v7
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX8-NEXT:    v_xor_b32_e32 v3, s0, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s10, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v3, v8, vcc
+; GFX8-NEXT:    s_xor_b32 s0, s3, s2
+; GFX8-NEXT:    v_xor_b32_e32 v3, s0, v7
+; GFX8-NEXT:    v_xor_b32_e32 v7, s3, v8
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s5
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s0, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s4
-; GFX8-NEXT:    v_xor_b32_e32 v7, s2, v7
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s2, v7
+; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s3, v7
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
@@ -1048,7 +1043,7 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ;
 ; GFX9-LABEL: sdivrem_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x20
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4f7ffffe
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s6, s12, 31
@@ -1056,116 +1051,114 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX9-NEXT:    s_xor_b32 s7, s0, s6
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
-; GFX9-NEXT:    s_ashr_i32 s4, s13, 31
-; GFX9-NEXT:    s_add_i32 s5, s13, s4
+; GFX9-NEXT:    s_ashr_i32 s5, s13, 31
+; GFX9-NEXT:    s_add_i32 s12, s13, s5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s12, 0, s7
-; GFX9-NEXT:    s_xor_b32 s5, s5, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
+; GFX9-NEXT:    s_xor_b32 s12, s12, s5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s12
+; GFX9-NEXT:    s_sub_i32 s13, 0, s7
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s13, 0, s5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s12, s8, 31
-; GFX9-NEXT:    s_add_i32 s8, s8, s12
-; GFX9-NEXT:    s_xor_b32 s8, s8, s12
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
+; GFX9-NEXT:    s_ashr_i32 s4, s8, 31
+; GFX9-NEXT:    s_add_i32 s8, s8, s4
+; GFX9-NEXT:    v_mul_lo_u32 v3, s13, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_xor_b32 s6, s12, s6
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s13, v1
+; GFX9-NEXT:    s_xor_b32 s8, s8, s4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
+; GFX9-NEXT:    s_sub_i32 s16, 0, s12
 ; GFX9-NEXT:    s_ashr_i32 s13, s9, 31
 ; GFX9-NEXT:    s_add_i32 s9, s9, s13
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
+; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s16, v1
+; GFX9-NEXT:    s_xor_b32 s9, s9, s13
+; GFX9-NEXT:    s_xor_b32 s6, s4, s6
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s7
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX9-NEXT:    s_xor_b32 s4, s13, s4
+; GFX9-NEXT:    s_xor_b32 s5, s13, s5
 ; GFX9-NEXT:    v_sub_u32_e32 v4, s8, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v5, s7, v4
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v5, s7, v4
-; GFX9-NEXT:    s_xor_b32 s7, s9, s13
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s7, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s12
+; GFX9-NEXT:    v_xor_b32_e32 v3, s4, v3
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s4, v3
+; GFX9-NEXT:    s_ashr_i32 s4, s14, 31
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s5
-; GFX9-NEXT:    v_xor_b32_e32 v3, s12, v3
-; GFX9-NEXT:    s_ashr_i32 s6, s14, 31
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s12, v3
-; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v5
-; GFX9-NEXT:    s_add_i32 s7, s14, s6
-; GFX9-NEXT:    s_xor_b32 s7, s7, s6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s7
+; GFX9-NEXT:    s_add_i32 s6, s14, s4
+; GFX9-NEXT:    s_xor_b32 s6, s6, s4
+; GFX9-NEXT:    v_sub_u32_e32 v3, s9, v5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s6
 ; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s5, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s12, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
-; GFX9-NEXT:    s_sub_i32 s8, 0, s7
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
+; GFX9-NEXT:    s_sub_i32 s7, 0, s6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v5
-; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
-; GFX9-NEXT:    s_ashr_i32 s4, s15, 31
-; GFX9-NEXT:    s_add_i32 s9, s15, s4
+; GFX9-NEXT:    v_mul_lo_u32 v6, s7, v5
+; GFX9-NEXT:    v_xor_b32_e32 v1, s5, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v1, s5, v1
+; GFX9-NEXT:    s_ashr_i32 s5, s15, 31
+; GFX9-NEXT:    s_add_i32 s9, s15, s5
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GFX9-NEXT:    s_xor_b32 s9, s9, s4
+; GFX9-NEXT:    s_xor_b32 s9, s9, s5
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, s9
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s5, v3
-; GFX9-NEXT:    s_ashr_i32 s5, s10, 31
-; GFX9-NEXT:    s_add_i32 s8, s10, s5
-; GFX9-NEXT:    s_xor_b32 s8, s8, s5
+; GFX9-NEXT:    s_ashr_i32 s7, s10, 31
+; GFX9-NEXT:    s_add_i32 s8, s10, s7
+; GFX9-NEXT:    s_xor_b32 s8, s8, s7
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v6, s8, v5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v8
+; GFX9-NEXT:    v_subrev_u32_e32 v7, s12, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v3, s13, v3
-; GFX9-NEXT:    v_mul_lo_u32 v7, v6, s7
+; GFX9-NEXT:    v_mul_lo_u32 v7, v6, s6
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v8, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_xor_b32_e32 v3, s13, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v5, s13, v3
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v7
 ; GFX9-NEXT:    s_sub_i32 s8, 0, s9
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s8, v2
 ; GFX9-NEXT:    v_add_u32_e32 v7, 1, v6
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s7, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v7, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v8
 ; GFX9-NEXT:    v_add_u32_e32 v7, 1, v6
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s7, v3
-; GFX9-NEXT:    s_ashr_i32 s7, s11, 31
-; GFX9-NEXT:    s_add_i32 s8, s11, s7
-; GFX9-NEXT:    s_xor_b32 s8, s8, s7
+; GFX9-NEXT:    v_subrev_u32_e32 v7, s6, v3
+; GFX9-NEXT:    s_ashr_i32 s6, s11, 31
+; GFX9-NEXT:    s_add_i32 s8, s11, s6
+; GFX9-NEXT:    s_xor_b32 s8, s8, s6
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v8, s8, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX9-NEXT:    s_xor_b32 s6, s5, s6
-; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
-; GFX9-NEXT:    v_mul_lo_u32 v7, v8, s9
-; GFX9-NEXT:    v_xor_b32_e32 v2, s6, v6
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s5, v3
 ; GFX9-NEXT:    s_xor_b32 s4, s7, s4
+; GFX9-NEXT:    v_xor_b32_e32 v3, s7, v3
+; GFX9-NEXT:    v_mul_lo_u32 v7, v8, s9
+; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v6
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s7, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v7
 ; GFX9-NEXT:    v_add_u32_e32 v7, 1, v8
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
@@ -1176,148 +1169,147 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v8, s9, v3
+; GFX9-NEXT:    s_xor_b32 s4, s6, s5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v3, v8, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s4, v7
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s6, v2
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_xor_b32_e32 v7, s7, v8
+; GFX9-NEXT:    v_xor_b32_e32 v7, s6, v8
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s7, v7
+; GFX9-NEXT:    v_subrev_u32_e32 v7, s6, v7
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sdivrem_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0x4f7ffffe
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s12, s8, 31
-; GFX10-NEXT:    s_ashr_i32 s14, s10, 31
-; GFX10-NEXT:    s_add_i32 s6, s8, s12
-; GFX10-NEXT:    s_add_i32 s8, s10, s14
-; GFX10-NEXT:    s_xor_b32 s10, s6, s12
-; GFX10-NEXT:    s_ashr_i32 s13, s9, 31
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s10
-; GFX10-NEXT:    s_ashr_i32 s15, s11, 31
-; GFX10-NEXT:    s_add_i32 s7, s9, s13
-; GFX10-NEXT:    s_add_i32 s9, s11, s15
-; GFX10-NEXT:    s_xor_b32 s11, s7, s13
-; GFX10-NEXT:    s_xor_b32 s8, s8, s14
+; GFX10-NEXT:    s_ashr_i32 s0, s12, 31
+; GFX10-NEXT:    s_ashr_i32 s2, s14, 31
+; GFX10-NEXT:    s_add_i32 s6, s12, s0
+; GFX10-NEXT:    s_add_i32 s12, s14, s2
+; GFX10-NEXT:    s_xor_b32 s14, s6, s0
+; GFX10-NEXT:    s_ashr_i32 s1, s13, 31
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s14
+; GFX10-NEXT:    s_ashr_i32 s3, s15, 31
+; GFX10-NEXT:    s_add_i32 s7, s13, s1
+; GFX10-NEXT:    s_add_i32 s13, s15, s3
+; GFX10-NEXT:    s_xor_b32 s15, s7, s1
+; GFX10-NEXT:    s_xor_b32 s12, s12, s2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s12
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s8
-; GFX10-NEXT:    s_xor_b32 s9, s9, s15
-; GFX10-NEXT:    s_sub_i32 s6, 0, s10
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s9
+; GFX10-NEXT:    s_xor_b32 s13, s13, s3
+; GFX10-NEXT:    s_sub_i32 s6, 0, s14
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s13
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX10-NEXT:    s_sub_i32 s7, 0, s11
-; GFX10-NEXT:    s_sub_i32 s19, 0, s8
+; GFX10-NEXT:    s_sub_i32 s7, 0, s15
+; GFX10-NEXT:    s_sub_i32 s19, 0, s12
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX10-NEXT:    s_ashr_i32 s16, s8, 31
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX10-NEXT:    s_ashr_i32 s16, s0, 31
-; GFX10-NEXT:    s_ashr_i32 s17, s1, 31
-; GFX10-NEXT:    s_add_i32 s0, s0, s16
-; GFX10-NEXT:    s_ashr_i32 s18, s2, 31
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    s_ashr_i32 s18, s10, 31
+; GFX10-NEXT:    s_ashr_i32 s17, s9, 31
+; GFX10-NEXT:    s_xor_b32 s20, s16, s0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v4
-; GFX10-NEXT:    s_xor_b32 s0, s0, s16
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    s_xor_b32 s21, s17, s1
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GFX10-NEXT:    v_mul_lo_u32 v4, s6, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT:    s_sub_i32 s6, 0, s9
+; GFX10-NEXT:    v_mul_lo_u32 v4, s6, v0
+; GFX10-NEXT:    s_sub_i32 s6, 0, s13
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX10-NEXT:    s_add_i32 s1, s1, s17
 ; GFX10-NEXT:    v_mul_lo_u32 v5, s7, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v6, s19, v2
-; GFX10-NEXT:    v_mul_hi_u32 v4, v0, v4
+; GFX10-NEXT:    s_ashr_i32 s19, s11, 31
+; GFX10-NEXT:    s_add_i32 s7, s9, s17
 ; GFX10-NEXT:    v_mul_lo_u32 v7, s6, v3
-; GFX10-NEXT:    s_add_i32 s2, s2, s18
-; GFX10-NEXT:    s_ashr_i32 s19, s3, 31
-; GFX10-NEXT:    s_xor_b32 s1, s1, s17
-; GFX10-NEXT:    s_xor_b32 s2, s2, s18
+; GFX10-NEXT:    v_mul_hi_u32 v4, v0, v4
+; GFX10-NEXT:    s_add_i32 s6, s8, s16
+; GFX10-NEXT:    s_add_i32 s8, s10, s18
 ; GFX10-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v2, v6
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v4
+; GFX10-NEXT:    s_xor_b32 s10, s6, s16
+; GFX10-NEXT:    s_add_i32 s9, s11, s19
 ; GFX10-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GFX10-NEXT:    s_add_i32 s3, s3, s19
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX10-NEXT:    s_xor_b32 s3, s3, s19
-; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v4
+; GFX10-NEXT:    s_xor_b32 s11, s7, s17
+; GFX10-NEXT:    s_xor_b32 s8, s8, s18
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v6
-; GFX10-NEXT:    s_xor_b32 s12, s16, s12
+; GFX10-NEXT:    v_mul_hi_u32 v0, s10, v0
+; GFX10-NEXT:    s_xor_b32 s9, s9, s19
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v7
-; GFX10-NEXT:    s_xor_b32 s13, s17, s13
-; GFX10-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX10-NEXT:    v_mul_hi_u32 v2, s2, v2
-; GFX10-NEXT:    v_mul_lo_u32 v4, v0, s10
-; GFX10-NEXT:    v_mul_hi_u32 v3, s3, v3
+; GFX10-NEXT:    v_mul_hi_u32 v1, s11, v1
+; GFX10-NEXT:    v_mul_hi_u32 v2, s8, v2
+; GFX10-NEXT:    s_xor_b32 s22, s18, s2
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    v_mul_hi_u32 v3, s9, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, v0, s14
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v0
-; GFX10-NEXT:    s_xor_b32 s14, s18, s14
-; GFX10-NEXT:    v_mul_lo_u32 v5, v1, s11
-; GFX10-NEXT:    v_mul_lo_u32 v6, v2, s8
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s0, v4
-; GFX10-NEXT:    v_mul_lo_u32 v7, v3, s9
+; GFX10-NEXT:    v_mul_lo_u32 v5, v1, s15
+; GFX10-NEXT:    v_mul_lo_u32 v6, v2, s12
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v2
+; GFX10-NEXT:    v_mul_lo_u32 v7, v3, s13
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s10, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v3
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v4
-; GFX10-NEXT:    v_sub_nc_u32_e32 v5, s1, v5
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s2, v6
-; GFX10-NEXT:    v_sub_nc_u32_e32 v7, s3, v7
+; GFX10-NEXT:    v_sub_nc_u32_e32 v5, s11, v5
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s8, v6
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v7, s9, v7
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s15, v5
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s12, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, s10, v4
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v5
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s8, v6
-; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s9, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, s14, v4
+; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s13, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s11, v5
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s15, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s8, v6
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s12, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v11, s9, v7
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v11, s13, v7
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, s10, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v2
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v5
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s8, v6
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s15, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, s14, v4
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s12, v6
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v7
+; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s13, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s11, v5
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s15, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s8, v6
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v12, s9, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s12, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, s13, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s0
-; GFX10-NEXT:    s_xor_b32 s0, s19, s15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc_lo
-; GFX10-NEXT:    v_xor_b32_e32 v0, s12, v0
-; GFX10-NEXT:    v_xor_b32_e32 v1, s13, v1
-; GFX10-NEXT:    v_xor_b32_e32 v2, s14, v2
+; GFX10-NEXT:    s_xor_b32 s0, s19, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s2
+; GFX10-NEXT:    v_xor_b32_e32 v0, s20, v0
+; GFX10-NEXT:    v_xor_b32_e32 v1, s21, v1
+; GFX10-NEXT:    v_xor_b32_e32 v2, s22, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v3, s0, v3
 ; GFX10-NEXT:    v_xor_b32_e32 v4, s16, v4
 ; GFX10-NEXT:    v_xor_b32_e32 v5, s17, v5
 ; GFX10-NEXT:    v_xor_b32_e32 v6, s18, v6
 ; GFX10-NEXT:    v_xor_b32_e32 v7, s19, v7
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s12, v0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s13, v1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s14, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s20, v0
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s21, v1
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s22, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s0, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v4, s16, v4
@@ -1338,27 +1330,26 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64> addrspace(1)* %out1, <2 x i64> %x, <2 x i64> %y) {
 ; GFX8-LABEL: sdivrem_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
+; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_ashr_i32 s6, s9, 31
-; GFX8-NEXT:    s_ashr_i32 s12, s1, 31
-; GFX8-NEXT:    s_add_u32 s14, s8, s6
-; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX8-NEXT:    s_and_b32 s7, s7, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
-; GFX8-NEXT:    s_addc_u32 s15, s9, s6
-; GFX8-NEXT:    s_add_u32 s0, s0, s12
-; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX8-NEXT:    s_and_b32 s7, s7, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
-; GFX8-NEXT:    s_mov_b32 s13, s12
-; GFX8-NEXT:    s_addc_u32 s1, s1, s12
-; GFX8-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
+; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
+; GFX8-NEXT:    s_ashr_i32 s6, s13, 31
+; GFX8-NEXT:    s_add_u32 s0, s8, s2
+; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_addc_u32 s1, s9, s2
+; GFX8-NEXT:    s_add_u32 s8, s12, s6
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX8-NEXT:    s_mov_b32 s7, s6
+; GFX8-NEXT:    s_addc_u32 s9, s13, s6
+; GFX8-NEXT:    s_xor_b64 s[8:9], s[8:9], s[6:7]
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX8-NEXT:    s_mov_b32 s7, s6
-; GFX8-NEXT:    s_xor_b64 s[14:15], s[14:15], s[6:7]
+; GFX8-NEXT:    s_mov_b32 s3, s2
+; GFX8-NEXT:    s_xor_b64 s[12:13], s[0:1], s[2:3]
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -1433,24 +1424,24 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v2, s15, v0
-; GFX8-NEXT:    v_mul_lo_u32 v3, s14, v1
-; GFX8-NEXT:    v_mul_hi_u32 v5, s14, v0
-; GFX8-NEXT:    v_mul_hi_u32 v0, s15, v0
-; GFX8-NEXT:    v_mov_b32_e32 v4, s15
+; GFX8-NEXT:    v_mul_lo_u32 v2, s13, v0
+; GFX8-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX8-NEXT:    v_mul_hi_u32 v5, s12, v0
+; GFX8-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s13
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v5, s15, v1
+; GFX8-NEXT:    v_mul_lo_u32 v5, s13, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT:    v_mul_hi_u32 v3, s14, v1
+; GFX8-NEXT:    v_mul_hi_u32 v3, s12, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT:    v_mul_hi_u32 v1, s15, v1
+; GFX8-NEXT:    v_mul_hi_u32 v1, s13, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
@@ -1461,9 +1452,9 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v5, s8, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s14, v5
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s12, v5
 ; GFX8-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v2, s[0:1], s15, v2
+; GFX8-NEXT:    v_sub_u32_e64 v2, s[0:1], s13, v2
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
@@ -1494,37 +1485,37 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[0:1]
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[6:7], s[12:13]
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[6:7]
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX8-NEXT:    s_ashr_i32 s8, s11, 31
-; GFX8-NEXT:    s_ashr_i32 s12, s3, 31
+; GFX8-NEXT:    s_ashr_i32 s6, s11, 31
+; GFX8-NEXT:    s_ashr_i32 s8, s15, 31
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    s_add_u32 s0, s10, s8
+; GFX8-NEXT:    s_add_u32 s0, s10, s6
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_addc_u32 s1, s11, s8
-; GFX8-NEXT:    s_add_u32 s2, s2, s12
-; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX8-NEXT:    s_and_b32 s7, s7, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
-; GFX8-NEXT:    s_mov_b32 s13, s12
-; GFX8-NEXT:    s_addc_u32 s3, s3, s12
-; GFX8-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
-; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, s3
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s2
+; GFX8-NEXT:    s_addc_u32 s1, s11, s6
+; GFX8-NEXT:    s_add_u32 s10, s14, s8
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    s_mov_b32 s9, s8
-; GFX8-NEXT:    s_xor_b64 s[10:11], s[0:1], s[8:9]
+; GFX8-NEXT:    s_addc_u32 s11, s15, s8
+; GFX8-NEXT:    s_xor_b64 s[10:11], s[10:11], s[8:9]
+; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, s11
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s10
+; GFX8-NEXT:    s_mov_b32 s7, s6
+; GFX8-NEXT:    s_xor_b64 s[12:13], s[0:1], s[6:7]
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
 ; GFX8-NEXT:    v_add_f32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GFX8-NEXT:    s_sub_u32 s0, 0, s2
+; GFX8-NEXT:    s_sub_u32 s0, 0, s10
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
@@ -1535,19 +1526,19 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v4
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_subb_u32 s1, 0, s3
+; GFX8-NEXT:    s_subb_u32 s1, 0, s11
 ; GFX8-NEXT:    v_mul_lo_u32 v4, s1, v7
 ; GFX8-NEXT:    v_mul_lo_u32 v8, s0, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v10, s0, v7
 ; GFX8-NEXT:    v_mul_lo_u32 v9, s0, v7
-; GFX8-NEXT:    v_xor_b32_e32 v3, s6, v3
+; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v4, v10
-; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
-; GFX8-NEXT:    v_mov_b32_e32 v5, s6
+; GFX8-NEXT:    v_xor_b32_e32 v2, s2, v2
+; GFX8-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX8-NEXT:    v_mul_lo_u32 v10, v6, v9
 ; GFX8-NEXT:    v_mul_lo_u32 v11, v7, v8
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s6, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s2, v3
 ; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v2, v5, vcc
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v7, v9
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v10, v11
@@ -1574,14 +1565,13 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v7, s0, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v9, s0, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v8, s0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v10, s3
+; GFX8-NEXT:    v_mov_b32_e32 v10, s11
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v3, v8
 ; GFX8-NEXT:    v_mul_lo_u32 v9, v2, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v11, v2, v8
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v3, v8
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v11
@@ -1601,51 +1591,51 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v6, s11, v2
-; GFX8-NEXT:    v_mul_lo_u32 v7, s10, v3
-; GFX8-NEXT:    v_mul_hi_u32 v9, s10, v2
-; GFX8-NEXT:    v_mul_hi_u32 v2, s11, v2
-; GFX8-NEXT:    v_mov_b32_e32 v8, s11
+; GFX8-NEXT:    v_mul_lo_u32 v6, s13, v2
+; GFX8-NEXT:    v_mul_lo_u32 v7, s12, v3
+; GFX8-NEXT:    v_mul_hi_u32 v9, s12, v2
+; GFX8-NEXT:    v_mul_hi_u32 v2, s13, v2
+; GFX8-NEXT:    v_mov_b32_e32 v8, s13
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v9, s11, v3
+; GFX8-NEXT:    v_mul_lo_u32 v9, s13, v3
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
-; GFX8-NEXT:    v_mul_hi_u32 v7, s10, v3
+; GFX8-NEXT:    v_mul_hi_u32 v7, s12, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
-; GFX8-NEXT:    v_mul_hi_u32 v3, s11, v3
+; GFX8-NEXT:    v_mul_hi_u32 v3, s13, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
-; GFX8-NEXT:    v_mul_lo_u32 v6, s3, v2
-; GFX8-NEXT:    v_mul_lo_u32 v7, s2, v3
-; GFX8-NEXT:    v_mul_hi_u32 v11, s2, v2
-; GFX8-NEXT:    v_mul_lo_u32 v9, s2, v2
+; GFX8-NEXT:    v_mul_lo_u32 v6, s11, v2
+; GFX8-NEXT:    v_mul_lo_u32 v7, s10, v3
+; GFX8-NEXT:    v_mul_hi_u32 v11, s10, v2
+; GFX8-NEXT:    v_mul_lo_u32 v9, s10, v2
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v11
-; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s10, v9
+; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s12, v9
 ; GFX8-NEXT:    v_subb_u32_e64 v8, s[0:1], v8, v6, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s11, v6
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v8
+; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s13, v6
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v7
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v8
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v8
 ; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v10, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[0:1]
-; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, s2, v7
+; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, s10, v7
 ; GFX8-NEXT:    v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v12
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v11
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v12
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v14, s[0:1], 1, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v10, vcc
@@ -1653,61 +1643,61 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 1, v14
 ; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v15, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX8-NEXT:    v_subrev_u32_e64 v13, s[0:1], s2, v11
+; GFX8-NEXT:    v_subrev_u32_e64 v13, s[0:1], s10, v11
 ; GFX8-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v14, v10, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v13, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
+; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[8:9], s[12:13]
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[6:7], s[8:9]
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v3, s1, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s1
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
-; GFX8-NEXT:    v_xor_b32_e32 v7, s8, v7
-; GFX8-NEXT:    v_xor_b32_e32 v8, s8, v6
-; GFX8-NEXT:    v_mov_b32_e32 v9, s8
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s8, v7
+; GFX8-NEXT:    v_xor_b32_e32 v7, s6, v7
+; GFX8-NEXT:    v_xor_b32_e32 v8, s6, v6
+; GFX8-NEXT:    v_mov_b32_e32 v9, s6
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s6, v7
 ; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v8, v9, vcc
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v9, s5
-; GFX8-NEXT:    v_mov_b32_e32 v8, s4
+; GFX8-NEXT:    v_mov_b32_e32 v8, s12
+; GFX8-NEXT:    v_mov_b32_e32 v9, s13
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_mov_b32_e32 v0, s14
+; GFX8-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdivrem_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s6, s9, 31
-; GFX9-NEXT:    s_ashr_i32 s12, s1, 31
-; GFX9-NEXT:    s_add_u32 s14, s8, s6
-; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX9-NEXT:    s_and_b32 s7, s7, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
-; GFX9-NEXT:    s_addc_u32 s15, s9, s6
-; GFX9-NEXT:    s_add_u32 s0, s0, s12
-; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX9-NEXT:    s_and_b32 s7, s7, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
-; GFX9-NEXT:    s_mov_b32 s13, s12
-; GFX9-NEXT:    s_addc_u32 s1, s1, s12
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
+; GFX9-NEXT:    s_ashr_i32 s2, s9, 31
+; GFX9-NEXT:    s_ashr_i32 s6, s13, 31
+; GFX9-NEXT:    s_add_u32 s0, s8, s2
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_addc_u32 s1, s9, s2
+; GFX9-NEXT:    s_add_u32 s8, s12, s6
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX9-NEXT:    s_mov_b32 s7, s6
+; GFX9-NEXT:    s_addc_u32 s9, s13, s6
+; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[6:7]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX9-NEXT:    s_mov_b32 s7, s6
-; GFX9-NEXT:    s_xor_b64 s[14:15], s[14:15], s[6:7]
+; GFX9-NEXT:    s_mov_b32 s3, s2
+; GFX9-NEXT:    s_xor_b64 s[12:13], s[0:1], s[2:3]
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -1727,7 +1717,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s1, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, s15
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
 ; GFX9-NEXT:    v_add3_u32 v2, v3, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -1778,19 +1768,19 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s15, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s14, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s15, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s13, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s15, v1
+; GFX9-NEXT:    v_mul_lo_u32 v4, s13, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v1
-; GFX9-NEXT:    v_mul_hi_u32 v1, s15, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s12, v1
+; GFX9-NEXT:    v_mul_hi_u32 v1, s13, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
@@ -1804,10 +1794,10 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v3, v4
-; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s14, v6
+; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s12, v6
 ; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v7, v2, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
-; GFX9-NEXT:    v_sub_u32_e32 v2, s15, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, s13, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
@@ -1832,35 +1822,35 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[0:1]
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[6:7], s[12:13]
-; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
-; GFX9-NEXT:    s_ashr_i32 s12, s3, 31
-; GFX9-NEXT:    s_add_u32 s10, s10, s8
-; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX9-NEXT:    s_and_b32 s7, s7, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
-; GFX9-NEXT:    s_addc_u32 s11, s11, s8
-; GFX9-NEXT:    s_add_u32 s2, s2, s12
-; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX9-NEXT:    s_and_b32 s7, s7, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
-; GFX9-NEXT:    s_mov_b32 s13, s12
-; GFX9-NEXT:    s_addc_u32 s3, s3, s12
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[6:7]
+; GFX9-NEXT:    s_ashr_i32 s6, s11, 31
+; GFX9-NEXT:    s_ashr_i32 s8, s15, 31
+; GFX9-NEXT:    s_add_u32 s12, s10, s6
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX9-NEXT:    s_addc_u32 s13, s11, s6
+; GFX9-NEXT:    s_add_u32 s10, s14, s8
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX9-NEXT:    s_mov_b32 s9, s8
+; GFX9-NEXT:    s_addc_u32 s11, s15, s8
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
+; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s11
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v6
 ; GFX9-NEXT:    v_add_f32_e32 v4, v4, v7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[8:9]
-; GFX9-NEXT:    s_sub_u32 s7, 0, s2
+; GFX9-NEXT:    s_mov_b32 s7, s6
+; GFX9-NEXT:    s_xor_b64 s[12:13], s[12:13], s[6:7]
+; GFX9-NEXT:    s_sub_u32 s3, 0, s10
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
@@ -1871,12 +1861,12 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX9-NEXT:    s_and_b32 s14, s14, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
-; GFX9-NEXT:    s_subb_u32 s14, 0, s3
+; GFX9-NEXT:    s_subb_u32 s14, 0, s11
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s14, v4
-; GFX9-NEXT:    v_mul_lo_u32 v7, s7, v5
-; GFX9-NEXT:    v_mul_hi_u32 v8, s7, v4
+; GFX9-NEXT:    v_mul_lo_u32 v7, s3, v5
+; GFX9-NEXT:    v_mul_hi_u32 v8, s3, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v4
+; GFX9-NEXT:    v_mul_lo_u32 v9, s3, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_add3_u32 v6, v6, v7, v8
@@ -1907,17 +1897,17 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v6, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s14, v4
-; GFX9-NEXT:    v_mul_lo_u32 v7, s7, v5
-; GFX9-NEXT:    v_mul_hi_u32 v8, s7, v4
-; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v4
-; GFX9-NEXT:    v_xor_b32_e32 v3, s6, v3
-; GFX9-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX9-NEXT:    v_mul_lo_u32 v7, s3, v5
+; GFX9-NEXT:    v_mul_hi_u32 v8, s3, v4
+; GFX9-NEXT:    v_mul_lo_u32 v9, s3, v4
+; GFX9-NEXT:    v_xor_b32_e32 v3, s2, v3
+; GFX9-NEXT:    v_xor_b32_e32 v2, s2, v2
 ; GFX9-NEXT:    v_add3_u32 v6, v6, v7, v8
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v5, v9
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v4, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v4, v9
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v5, v9
-; GFX9-NEXT:    v_mov_b32_e32 v10, s6
+; GFX9-NEXT:    v_mov_b32_e32 v10, s2
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v11
@@ -1936,20 +1926,20 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add3_u32 v6, v9, v8, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v4, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v5, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v7
-; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v6
-; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s6, v3
+; GFX9-NEXT:    v_mul_lo_u32 v8, s13, v7
+; GFX9-NEXT:    v_mul_lo_u32 v9, s12, v6
+; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s2, v3
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v2, v10, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v2, s10, v7
+; GFX9-NEXT:    v_mul_hi_u32 v2, s12, v7
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s11, v6
-; GFX9-NEXT:    v_mul_hi_u32 v7, s11, v7
+; GFX9-NEXT:    v_mul_lo_u32 v3, s13, v6
+; GFX9-NEXT:    v_mul_hi_u32 v7, s13, v7
 ; GFX9-NEXT:    v_add_u32_e32 v2, v8, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, s10, v6
-; GFX9-NEXT:    v_mul_hi_u32 v6, s11, v6
+; GFX9-NEXT:    v_mul_hi_u32 v8, s12, v6
+; GFX9-NEXT:    v_mul_hi_u32 v6, s13, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
@@ -1958,30 +1948,30 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v3, v7, v3, v6
-; GFX9-NEXT:    v_mul_lo_u32 v6, s3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v7, s2, v3
-; GFX9-NEXT:    v_mul_hi_u32 v8, s2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v10, s2, v2
-; GFX9-NEXT:    v_mov_b32_e32 v11, s11
-; GFX9-NEXT:    v_mov_b32_e32 v9, s3
+; GFX9-NEXT:    v_mul_lo_u32 v6, s11, v2
+; GFX9-NEXT:    v_mul_lo_u32 v7, s10, v3
+; GFX9-NEXT:    v_mul_hi_u32 v8, s10, v2
+; GFX9-NEXT:    v_mul_lo_u32 v10, s10, v2
+; GFX9-NEXT:    v_mov_b32_e32 v11, s13
+; GFX9-NEXT:    v_mov_b32_e32 v9, s11
 ; GFX9-NEXT:    v_add3_u32 v6, v6, v7, v8
-; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, s10, v10
+; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, s12, v10
 ; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[0:1], v11, v6, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v8
-; GFX9-NEXT:    v_sub_u32_e32 v6, s11, v6
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v8
+; GFX9-NEXT:    v_sub_u32_e32 v6, s13, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v7
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s2, v7
+; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s10, v7
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v12
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v11
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v12
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v14, s[0:1], 1, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v9, vcc
@@ -1991,96 +1981,94 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v14, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v15, s[0:1], s2, v11
+; GFX9-NEXT:    v_subrev_co_u32_e64 v15, s[0:1], s10, v11
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v11, v15, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[8:9], s[12:13]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[6:7], s[8:9]
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v8, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v7, s8, v7
+; GFX9-NEXT:    v_xor_b32_e32 v7, s6, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 0
-; GFX9-NEXT:    v_xor_b32_e32 v8, s8, v6
-; GFX9-NEXT:    v_mov_b32_e32 v9, s8
-; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s8, v7
+; GFX9-NEXT:    v_xor_b32_e32 v8, s6, v6
+; GFX9-NEXT:    v_mov_b32_e32 v9, s6
+; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s6, v7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v8, v9, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v13, v[0:3], s[4:5]
-; GFX9-NEXT:    global_store_dwordx4 v13, v[4:7], s[6:7]
+; GFX9-NEXT:    global_store_dwordx4 v13, v[0:3], s[12:13]
+; GFX9-NEXT:    global_store_dwordx4 v13, v[4:7], s[14:15]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sdivrem_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s12, s9, 31
-; GFX10-NEXT:    s_ashr_i32 s6, s1, 31
-; GFX10-NEXT:    s_add_u32 s14, s8, s12
-; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX10-NEXT:    s_mov_b32 s13, s12
-; GFX10-NEXT:    s_and_b32 s7, s7, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s7, 0
-; GFX10-NEXT:    s_addc_u32 s15, s9, s12
-; GFX10-NEXT:    s_add_u32 s0, s0, s6
-; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX10-NEXT:    s_and_b32 s8, s7, 1
+; GFX10-NEXT:    s_ashr_i32 s2, s9, 31
+; GFX10-NEXT:    s_ashr_i32 s6, s13, 31
+; GFX10-NEXT:    s_add_u32 s0, s8, s2
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    s_mov_b32 s7, s6
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX10-NEXT:    s_addc_u32 s1, s1, s6
-; GFX10-NEXT:    s_xor_b64 s[8:9], s[0:1], s[6:7]
-; GFX10-NEXT:    s_xor_b64 s[0:1], s[14:15], s[12:13]
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10-NEXT:    s_addc_u32 s1, s9, s2
+; GFX10-NEXT:    s_add_u32 s8, s12, s6
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    s_and_b32 s3, s3, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10-NEXT:    s_mov_b32 s3, s2
+; GFX10-NEXT:    s_addc_u32 s9, s13, s6
+; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT:    s_xor_b64 s[8:9], s[8:9], s[6:7]
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX10-NEXT:    s_sub_u32 s20, 0, s8
-; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX10-NEXT:    s_and_b32 s14, s14, 1
+; GFX10-NEXT:    s_and_b32 s12, s12, 1
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
-; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX10-NEXT:    s_subb_u32 s21, 0, s9
-; GFX10-NEXT:    s_ashr_i32 s14, s11, 31
-; GFX10-NEXT:    s_xor_b64 s[18:19], s[12:13], s[6:7]
-; GFX10-NEXT:    s_ashr_i32 s16, s3, 31
+; GFX10-NEXT:    s_ashr_i32 s12, s11, 31
+; GFX10-NEXT:    s_xor_b64 s[18:19], s[2:3], s[6:7]
+; GFX10-NEXT:    s_ashr_i32 s16, s15, 31
 ; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    s_add_u32 s6, s10, s14
-; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-NEXT:    s_add_u32 s6, s10, s12
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX10-NEXT:    s_mov_b32 s17, s16
-; GFX10-NEXT:    s_and_b32 s7, s7, 1
+; GFX10-NEXT:    s_and_b32 s3, s3, 1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s7, 0
-; GFX10-NEXT:    s_mov_b32 s15, s14
-; GFX10-NEXT:    s_addc_u32 s7, s11, s14
-; GFX10-NEXT:    s_add_u32 s2, s2, s16
-; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX10-NEXT:    s_and_b32 s10, s10, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10-NEXT:    s_mov_b32 s13, s12
+; GFX10-NEXT:    s_addc_u32 s7, s11, s12
+; GFX10-NEXT:    s_add_u32 s10, s14, s16
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    s_and_b32 s3, s3, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX10-NEXT:    s_addc_u32 s3, s3, s16
-; GFX10-NEXT:    s_xor_b64 s[10:11], s[6:7], s[14:15]
-; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX10-NEXT:    s_addc_u32 s11, s15, s16
+; GFX10-NEXT:    s_xor_b64 s[14:15], s[6:7], s[12:13]
+; GFX10-NEXT:    s_xor_b64 s[10:11], s[10:11], s[16:17]
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s11
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s2
-; GFX10-NEXT:    s_sub_u32 s6, 0, s2
-; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s10
+; GFX10-NEXT:    s_sub_u32 s3, 0, s10
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
 ; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX10-NEXT:    s_and_b32 s7, s7, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX10-NEXT:    s_and_b32 s6, s6, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v3, 0xcf800000, v2
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT:    s_subb_u32 s7, 0, s3
+; GFX10-NEXT:    s_subb_u32 s6, 0, s11
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_add_f32_e32 v0, v3, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s20, v2
@@ -2102,23 +2090,23 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX10-NEXT:    v_add_f32_e32 v1, v9, v1
-; GFX10-NEXT:    v_add_co_u32 v5, s13, v5, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s13
-; GFX10-NEXT:    v_add_co_u32 v6, s13, v10, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s13
-; GFX10-NEXT:    v_add_co_u32 v5, s13, v5, v7
+; GFX10-NEXT:    v_add_co_u32 v5, s7, v5, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s7
+; GFX10-NEXT:    v_add_co_u32 v6, s7, v10, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s7
+; GFX10-NEXT:    v_add_co_u32 v5, s7, v5, v7
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s13
-; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v4
-; GFX10-NEXT:    v_add_co_u32 v6, s13, v6, v11
-; GFX10-NEXT:    v_mul_lo_u32 v12, s7, v1
-; GFX10-NEXT:    v_mul_hi_u32 v13, s6, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s7
+; GFX10-NEXT:    v_mul_lo_u32 v9, s3, v4
+; GFX10-NEXT:    v_add_co_u32 v6, s7, v6, v11
+; GFX10-NEXT:    v_mul_lo_u32 v12, s6, v1
+; GFX10-NEXT:    v_mul_hi_u32 v13, s3, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v8, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s13
-; GFX10-NEXT:    v_mul_lo_u32 v11, s6, v1
-; GFX10-NEXT:    v_add_co_u32 v5, s13, v6, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s7
+; GFX10-NEXT:    v_mul_lo_u32 v11, s3, v1
+; GFX10-NEXT:    v_add_co_u32 v5, s7, v6, v5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v10, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s7
 ; GFX10-NEXT:    v_add3_u32 v8, v12, v9, v13
 ; GFX10-NEXT:    v_mul_lo_u32 v9, v4, v11
 ; GFX10-NEXT:    v_mul_hi_u32 v10, v1, v11
@@ -2130,218 +2118,218 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_mul_hi_u32 v5, v1, v8
 ; GFX10-NEXT:    v_mul_lo_u32 v12, s21, v0
-; GFX10-NEXT:    v_add_co_u32 v6, s13, v9, v6
+; GFX10-NEXT:    v_add_co_u32 v6, s7, v9, v6
 ; GFX10-NEXT:    v_mul_hi_u32 v13, s20, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v14, s20, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s13
-; GFX10-NEXT:    v_add_co_u32 v7, s13, v7, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s13
-; GFX10-NEXT:    v_add_co_u32 v6, s13, v6, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s7
+; GFX10-NEXT:    v_add_co_u32 v7, s7, v7, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s7
+; GFX10-NEXT:    v_add_co_u32 v6, s7, v6, v10
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s20, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s13
-; GFX10-NEXT:    v_add_co_u32 v5, s13, v7, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s7
+; GFX10-NEXT:    v_add_co_u32 v5, s7, v7, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s7
 ; GFX10-NEXT:    v_add3_u32 v12, v12, v14, v13
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, v9, v6
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v4, v8
 ; GFX10-NEXT:    v_mul_lo_u32 v10, v2, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v11, v7
 ; GFX10-NEXT:    v_mul_lo_u32 v11, v0, v12
-; GFX10-NEXT:    v_add_co_u32 v5, s13, v5, v6
+; GFX10-NEXT:    v_add_co_u32 v5, s7, v5, v6
 ; GFX10-NEXT:    v_mul_hi_u32 v9, v0, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s7
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v13, v2, v12
 ; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v5
-; GFX10-NEXT:    v_add_co_u32 v5, s13, v10, v11
-; GFX10-NEXT:    v_mul_hi_u32 v14, v0, v12
+; GFX10-NEXT:    v_add_co_u32 v5, s7, v10, v11
 ; GFX10-NEXT:    v_add3_u32 v6, v7, v6, v8
-; GFX10-NEXT:    v_mul_lo_u32 v10, s7, v1
-; GFX10-NEXT:    v_add_co_u32 v5, s7, v5, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s13
-; GFX10-NEXT:    v_add_co_u32 v3, s13, v13, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s7
+; GFX10-NEXT:    v_mul_hi_u32 v14, v0, v12
+; GFX10-NEXT:    v_mul_lo_u32 v10, s6, v1
+; GFX10-NEXT:    v_add_co_u32 v5, s6, v5, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s7
+; GFX10-NEXT:    v_add_co_u32 v3, s7, v13, v3
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v3, s7, v3, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s6
+; GFX10-NEXT:    v_mul_hi_u32 v11, s3, v1
+; GFX10-NEXT:    v_add_co_u32 v3, s6, v3, v14
+; GFX10-NEXT:    v_mul_lo_u32 v13, s3, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v7, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s13
-; GFX10-NEXT:    v_mul_hi_u32 v11, s6, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s7
-; GFX10-NEXT:    v_mul_lo_u32 v13, s6, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s6
 ; GFX10-NEXT:    v_mul_hi_u32 v7, v2, v12
-; GFX10-NEXT:    v_mul_lo_u32 v6, s6, v1
-; GFX10-NEXT:    v_add_co_u32 v3, s6, v3, v5
+; GFX10-NEXT:    v_mul_lo_u32 v6, s3, v1
+; GFX10-NEXT:    v_add_co_u32 v3, s3, v3, v5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, v8, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s3
 ; GFX10-NEXT:    v_add3_u32 v9, v10, v13, v11
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
-; GFX10-NEXT:    v_add3_u32 v5, v8, v5, v7
 ; GFX10-NEXT:    v_mul_lo_u32 v10, v4, v6
+; GFX10-NEXT:    v_add3_u32 v5, v8, v5, v7
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v1, v9
 ; GFX10-NEXT:    v_mul_hi_u32 v11, v1, v6
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v6
+; GFX10-NEXT:    v_mul_lo_u32 v8, v4, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v3, v4, v9
+; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v9
 ; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v0
-; GFX10-NEXT:    v_mul_hi_u32 v12, s0, v0
+; GFX10-NEXT:    v_add_co_u32 v7, s3, v10, v7
 ; GFX10-NEXT:    v_mul_lo_u32 v13, s0, v2
-; GFX10-NEXT:    v_add_co_u32 v7, s6, v10, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v6, s3, v8, v6
+; GFX10-NEXT:    v_mul_hi_u32 v12, s0, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX10-NEXT:    v_add_co_u32 v3, s6, v3, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v7, s6, v7, v11
 ; GFX10-NEXT:    v_mul_lo_u32 v14, s1, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v5, s6, v5, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v7, s3, v7, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v3, s3, v6, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v5, s3, v5, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v0, s3, v14, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v15, s0, v2
-; GFX10-NEXT:    v_mul_hi_u32 v8, v1, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v5, s7, v5, v12
-; GFX10-NEXT:    v_add_co_u32 v0, s6, v14, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s7
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v5, s3, v5, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s3
 ; GFX10-NEXT:    v_mul_hi_u32 v2, s1, v2
-; GFX10-NEXT:    v_add_co_u32 v0, s6, v0, v15
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, v11, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v3, s6, v3, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v0, s6, v0, v5
-; GFX10-NEXT:    v_add_nc_u32_e32 v11, v12, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s6
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v10, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v8, v6
+; GFX10-NEXT:    v_add_co_u32 v0, s3, v0, v15
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, v11, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s3
 ; GFX10-NEXT:    v_mul_hi_u32 v9, v4, v9
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v8
-; GFX10-NEXT:    v_mul_hi_u32 v8, s8, v0
-; GFX10-NEXT:    v_add3_u32 v2, v11, v5, v2
-; GFX10-NEXT:    v_add_co_u32 v3, s6, v3, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s6
-; GFX10-NEXT:    v_mul_lo_u32 v7, s9, v0
-; GFX10-NEXT:    v_mul_lo_u32 v10, s8, v2
-; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
-; GFX10-NEXT:    v_add3_u32 v5, v6, v5, v9
-; GFX10-NEXT:    v_mul_lo_u32 v6, s8, v0
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v4, v5, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v4, v7, v10, v8
-; GFX10-NEXT:    v_mul_lo_u32 v5, s11, v1
-; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, s0, v6
-; GFX10-NEXT:    v_mul_lo_u32 v14, s10, v3
-; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s1, v4
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v4, s0, s1, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v6
-; GFX10-NEXT:    v_mul_hi_u32 v7, s11, v1
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT:    v_mul_lo_u32 v15, s11, v3
-; GFX10-NEXT:    v_mul_hi_u32 v1, s10, v1
-; GFX10-NEXT:    v_mul_hi_u32 v17, s10, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v6, s8
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v13, s0, 0, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v4
-; GFX10-NEXT:    v_mul_hi_u32 v3, s11, v3
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v12
+; GFX10-NEXT:    v_add_co_u32 v0, s3, v0, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, v13, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v3, s3, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s3
+; GFX10-NEXT:    v_add3_u32 v2, v8, v5, v2
+; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
+; GFX10-NEXT:    v_add3_u32 v5, v6, v7, v9
+; GFX10-NEXT:    v_mul_lo_u32 v6, s9, v0
+; GFX10-NEXT:    v_mul_hi_u32 v7, s8, v0
+; GFX10-NEXT:    v_mul_lo_u32 v9, s8, v2
+; GFX10-NEXT:    v_mul_lo_u32 v3, s8, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v5, s15, v1
+; GFX10-NEXT:    v_mul_hi_u32 v10, s15, v1
+; GFX10-NEXT:    v_mul_hi_u32 v1, s14, v1
+; GFX10-NEXT:    v_mul_hi_u32 v17, s14, v4
+; GFX10-NEXT:    v_add3_u32 v6, v6, v9, v7
+; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, s0, v3
+; GFX10-NEXT:    v_mul_lo_u32 v7, s14, v4
+; GFX10-NEXT:    v_mul_lo_u32 v9, s15, v4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v11, s1, v6
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v6, s0, s1, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v3
+; GFX10-NEXT:    v_mul_hi_u32 v4, s15, v4
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v11, vcc_lo, s9, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, v3, s8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v15, s0, 0, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v6
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v11, vcc_lo, s9, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v13, v12, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v7, s0, v15, v7
+; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v9, s0, v9, v10
 ; GFX10-NEXT:    v_add_co_u32 v1, s1, v5, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v7, s0, v7, v17
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v9, s0, v9, v17
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v17, s0, v0, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v18, s0, 0, v2, s0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v14, v1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v13
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, v5, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v16, s0
-; GFX10-NEXT:    v_add_co_u32 v7, s0, v7, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, v7, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v15
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, v5, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v16, v13, s0
+; GFX10-NEXT:    v_add_co_u32 v9, s0, v9, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v14, s0, v17, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v15, s0, 0, v18, s0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_add3_u32 v3, v5, v1, v3
-; GFX10-NEXT:    v_sub_co_u32 v1, s0, v12, s8
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v5, s0, 0, v8, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v17, v14, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v18, v15, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v15, s3, v7
-; GFX10-NEXT:    v_mul_lo_u32 v16, s2, v3
-; GFX10-NEXT:    v_mul_hi_u32 v17, s2, v7
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_mul_lo_u32 v10, s2, v7
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v14, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v8, v15, v16, v17
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v12, v1, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s0
-; GFX10-NEXT:    v_sub_co_u32 v10, s0, s10, v10
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v11, s1, s11, v8, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s11, v8
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v11
+; GFX10-NEXT:    v_add_co_u32 v10, s0, v17, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v13, s0, 0, v18, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_add3_u32 v4, v5, v1, v4
+; GFX10-NEXT:    v_sub_co_u32 v1, s0, v14, s8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v5, s0, 0, v11, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v17, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v18, v13, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v13, s11, v9
+; GFX10-NEXT:    v_mul_lo_u32 v16, s10, v4
+; GFX10-NEXT:    v_mul_hi_u32 v17, s10, v9
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v7
+; GFX10-NEXT:    v_mul_lo_u32 v7, s10, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v14, v1, s0
+; GFX10-NEXT:    v_add3_u32 v10, v13, v16, v17
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v15, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v11, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v7, s0, s14, v7
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v11, s1, s15, v10, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s15, v10
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v11
 ; GFX10-NEXT:    v_xor_b32_e32 v0, s18, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v2, s19, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, vcc_lo, s3, v1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v10
+; GFX10-NEXT:    v_xor_b32_e32 v5, s2, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v10, vcc_lo, s11, v1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, v10, s2
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v14, s0, 0, v8, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, v7, s10
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v14, s0, 0, v10, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v0, s0, v0, s18
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s19, v2, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v11
-; GFX10-NEXT:    v_xor_b32_e32 v2, s12, v6
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s3, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v12, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v14
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v11
+; GFX10-NEXT:    v_xor_b32_e32 v2, s2, v3
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, s11, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v12, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v13
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v15, s0, v7, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v16, s0, 0, v3, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v14
+; GFX10-NEXT:    v_add_co_u32 v15, s0, v9, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v16, s0, 0, v4, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s0
 ; GFX10-NEXT:    v_add_co_u32 v12, s0, v15, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s0, 0, v16, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_sub_co_u32 v6, s0, v13, s2
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0
+; GFX10-NEXT:    v_sub_co_u32 v6, s0, v13, s10
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v10, s0, 0, v10, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v12, v15, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v5
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v13, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v14, v8, vcc_lo
-; GFX10-NEXT:    v_xor_b32_e32 v8, s12, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v12, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v13, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v14, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v12, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v4, v15, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v11, v6, s0
-; GFX10-NEXT:    s_xor_b64 s[0:1], s[14:15], s[16:17]
-; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v2, s12
-; GFX10-NEXT:    v_xor_b32_e32 v2, s0, v7
-; GFX10-NEXT:    v_xor_b32_e32 v3, s1, v3
-; GFX10-NEXT:    v_xor_b32_e32 v7, s14, v10
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s12, v8, vcc_lo
-; GFX10-NEXT:    v_xor_b32_e32 v8, s14, v6
+; GFX10-NEXT:    s_xor_b64 s[0:1], s[12:13], s[16:17]
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v2, s2
+; GFX10-NEXT:    v_xor_b32_e32 v2, s0, v9
+; GFX10-NEXT:    v_xor_b32_e32 v7, s1, v10
+; GFX10-NEXT:    v_xor_b32_e32 v9, s12, v3
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v5, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v10, s12, v6
 ; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v2, s0
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v7, s14
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s14, v8, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v7, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v9, s12
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v10, vcc_lo
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v9, v[0:3], s[4:5]
-; GFX10-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
+; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = sdiv <2 x i64> %x, %y
   store <2 x i64> %div, <2 x i64> addrspace(1)* %out0
@@ -2909,63 +2897,62 @@ define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out0, i16 addrspace(1)* %
 define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %x, <2 x i16> %y) {
 ; GFX8-LABEL: sdivrem_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x14
-; GFX8-NEXT:    s_load_dword s8, s[4:5], 0x10
-; GFX8-NEXT:    s_mov_b32 s9, 0x100010
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX8-NEXT:    s_mov_b32 s10, 0x100010
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sext_i32_i16 s1, s0
-; GFX8-NEXT:    s_ashr_i32 s2, s1, 31
-; GFX8-NEXT:    s_add_i32 s1, s1, s2
-; GFX8-NEXT:    s_xor_b32 s3, s1, s2
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX8-NEXT:    s_sub_i32 s6, 0, s3
-; GFX8-NEXT:    s_sext_i32_i16 s1, s8
-; GFX8-NEXT:    s_bfe_i32 s0, s0, s9
+; GFX8-NEXT:    s_sext_i32_i16 s0, s3
+; GFX8-NEXT:    s_ashr_i32 s8, s0, 31
+; GFX8-NEXT:    s_add_i32 s0, s0, s8
+; GFX8-NEXT:    s_xor_b32 s9, s0, s8
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX8-NEXT:    s_sub_i32 s6, 0, s9
+; GFX8-NEXT:    s_sext_i32_i16 s0, s2
+; GFX8-NEXT:    s_bfe_i32 s1, s3, s10
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_ashr_i32 s10, s1, 31
-; GFX8-NEXT:    s_ashr_i32 s11, s0, 31
-; GFX8-NEXT:    s_add_i32 s1, s1, s10
+; GFX8-NEXT:    s_ashr_i32 s3, s0, 31
+; GFX8-NEXT:    s_ashr_i32 s11, s1, 31
+; GFX8-NEXT:    s_add_i32 s0, s0, s3
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_add_i32 s0, s0, s11
-; GFX8-NEXT:    s_xor_b32 s1, s1, s10
-; GFX8-NEXT:    s_xor_b32 s12, s0, s11
+; GFX8-NEXT:    s_add_i32 s1, s1, s11
+; GFX8-NEXT:    s_xor_b32 s0, s0, s3
+; GFX8-NEXT:    s_xor_b32 s12, s1, s11
 ; GFX8-NEXT:    v_mul_lo_u32 v1, s6, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s12
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_mul_hi_u32 v0, s1, v0
+; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v2
-; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s3
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s9
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s1, v2
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s3, v2
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s9, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s3, v2
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s9, v2
 ; GFX8-NEXT:    s_sub_i32 s1, 0, s12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v1
-; GFX8-NEXT:    s_bfe_i32 s1, s8, s9
-; GFX8-NEXT:    s_xor_b32 s0, s10, s2
+; GFX8-NEXT:    s_bfe_i32 s1, s2, s10
 ; GFX8-NEXT:    s_ashr_i32 s2, s1, 31
-; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX8-NEXT:    s_add_i32 s1, s1, s2
+; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX8-NEXT:    s_xor_b32 s1, s1, s2
+; GFX8-NEXT:    s_xor_b32 s0, s3, s8
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX8-NEXT:    v_xor_b32_e32 v2, s10, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, s3, v2
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s12
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s10, v2
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v3
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
@@ -3000,106 +2987,104 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ;
 ; GFX9-LABEL: sdivrem_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x14
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX9-NEXT:    s_mov_b32 s10, 0x100010
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s0, s6
-; GFX9-NEXT:    s_ashr_i32 s7, s0, 31
-; GFX9-NEXT:    s_add_i32 s0, s0, s7
-; GFX9-NEXT:    s_xor_b32 s8, s0, s7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX9-NEXT:    s_load_dword s9, s[4:5], 0x10
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s4, 0x100010
-; GFX9-NEXT:    s_bfe_i32 s6, s6, s4
+; GFX9-NEXT:    s_sext_i32_i16 s0, s7
+; GFX9-NEXT:    s_ashr_i32 s8, s0, 31
+; GFX9-NEXT:    s_add_i32 s0, s0, s8
+; GFX9-NEXT:    s_xor_b32 s9, s0, s8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX9-NEXT:    s_bfe_i32 s1, s7, s10
+; GFX9-NEXT:    s_ashr_i32 s7, s1, 31
+; GFX9-NEXT:    s_add_i32 s1, s1, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_ashr_i32 s10, s6, 31
-; GFX9-NEXT:    s_add_i32 s6, s6, s10
-; GFX9-NEXT:    s_sub_i32 s11, 0, s8
+; GFX9-NEXT:    s_xor_b32 s11, s1, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; GFX9-NEXT:    s_sub_i32 s1, 0, s9
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s6, s6, s10
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s5, s9
-; GFX9-NEXT:    v_mul_lo_u32 v1, s11, v0
-; GFX9-NEXT:    s_ashr_i32 s11, s5, 31
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT:    s_add_i32 s5, s5, s11
-; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX9-NEXT:    s_xor_b32 s5, s5, s11
-; GFX9-NEXT:    s_bfe_i32 s4, s9, s4
-; GFX9-NEXT:    s_sub_i32 s9, 0, s6
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX9-NEXT:    s_sext_i32_i16 s0, s6
+; GFX9-NEXT:    s_ashr_i32 s12, s0, 31
+; GFX9-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX9-NEXT:    s_xor_b32 s7, s11, s7
-; GFX9-NEXT:    v_mul_lo_u32 v3, s9, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s8
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
+; GFX9-NEXT:    s_add_i32 s0, s0, s12
+; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX9-NEXT:    s_xor_b32 s13, s0, s12
+; GFX9-NEXT:    s_sub_i32 s0, 0, s11
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    s_bfe_i32 s4, s6, s10
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s9
+; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX9-NEXT:    s_ashr_i32 s5, s4, 31
 ; GFX9-NEXT:    s_add_i32 s4, s4, s5
+; GFX9-NEXT:    v_sub_u32_e32 v3, s13, v3
+; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
 ; GFX9-NEXT:    s_xor_b32 s4, s4, s5
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s4, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s8, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s9, v3
+; GFX9-NEXT:    v_mul_hi_u32 v1, s4, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s8, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s9, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s11
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; GFX9-NEXT:    s_xor_b32 s6, s12, s8
+; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    s_xor_b32 s4, s5, s10
-; GFX9-NEXT:    v_xor_b32_e32 v0, s7, v0
-; GFX9-NEXT:    v_xor_b32_e32 v2, s11, v2
+; GFX9-NEXT:    s_xor_b32 s4, s5, s7
+; GFX9-NEXT:    v_xor_b32_e32 v2, s12, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s7, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s11, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s12, v2
 ; GFX9-NEXT:    v_sub_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_sub_u32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v1
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v4, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    global_store_dword v2, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sdivrem_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x14
-; GFX10-NEXT:    s_mov_b32 s1, 0x100010
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX10-NEXT:    s_mov_b32 s2, 0x100010
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i16 s2, s0
-; GFX10-NEXT:    s_bfe_i32 s0, s0, s1
-; GFX10-NEXT:    s_ashr_i32 s3, s2, 31
-; GFX10-NEXT:    s_ashr_i32 s8, s0, 31
-; GFX10-NEXT:    s_add_i32 s2, s2, s3
-; GFX10-NEXT:    s_add_i32 s0, s0, s8
-; GFX10-NEXT:    s_xor_b32 s2, s2, s3
-; GFX10-NEXT:    s_xor_b32 s9, s0, s8
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x10
-; GFX10-NEXT:    s_sub_i32 s6, 0, s2
-; GFX10-NEXT:    s_sub_i32 s7, 0, s9
+; GFX10-NEXT:    s_sext_i32_i16 s3, s1
+; GFX10-NEXT:    s_bfe_i32 s1, s1, s2
+; GFX10-NEXT:    s_ashr_i32 s8, s3, 31
+; GFX10-NEXT:    s_ashr_i32 s9, s1, 31
+; GFX10-NEXT:    s_add_i32 s3, s3, s8
+; GFX10-NEXT:    s_add_i32 s1, s1, s9
+; GFX10-NEXT:    s_xor_b32 s3, s3, s8
+; GFX10-NEXT:    s_xor_b32 s1, s1, s9
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX10-NEXT:    s_sub_i32 s6, 0, s3
+; GFX10-NEXT:    s_sub_i32 s7, 0, s1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -3108,56 +3093,55 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s6, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s7, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_sext_i32_i16 s6, s0
-; GFX10-NEXT:    s_bfe_i32 s0, s0, s1
-; GFX10-NEXT:    s_ashr_i32 s1, s6, 31
+; GFX10-NEXT:    s_bfe_i32 s0, s0, s2
+; GFX10-NEXT:    s_ashr_i32 s2, s6, 31
 ; GFX10-NEXT:    s_ashr_i32 s10, s0, 31
-; GFX10-NEXT:    s_add_i32 s6, s6, s1
+; GFX10-NEXT:    s_add_i32 s6, s6, s2
 ; GFX10-NEXT:    s_add_i32 s0, s0, s10
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX10-NEXT:    s_xor_b32 s6, s6, s1
+; GFX10-NEXT:    s_xor_b32 s6, s6, s2
 ; GFX10-NEXT:    s_xor_b32 s0, s0, s10
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s0, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s2
-; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s9
+; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s3
+; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s6, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s9, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s3, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s9, v3
-; GFX10-NEXT:    s_xor_b32 s2, s1, s3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s3, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s1, v3
+; GFX10-NEXT:    s_xor_b32 s1, s2, s8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    s_xor_b32 s0, s10, s8
-; GFX10-NEXT:    v_xor_b32_e32 v0, s2, v0
+; GFX10-NEXT:    s_xor_b32 s0, s10, s9
+; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX10-NEXT:    v_xor_b32_e32 v2, s1, v2
+; GFX10-NEXT:    v_xor_b32_e32 v2, s2, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v3, s10, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_sub_nc_u32_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s1, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s2, v2
 ; GFX10-NEXT:    v_sub_nc_u32_sdwa v3, v3, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, v4, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 5bd27e6655054..106ca06a30191 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -555,13 +555,12 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 define amdgpu_kernel void @udivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32> addrspace(1)* %out1, <2 x i32> %x, <2 x i32> %y) {
 ; GFX8-LABEL: udivrem_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x18
-; GFX8-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x10
+; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX8-NEXT:    s_sub_i32 s0, 0, s2
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; GFX8-NEXT:    s_sub_i32 s0, 0, s10
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -569,7 +568,7 @@ define amdgpu_kernel void @udivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX8-NEXT:    s_sub_i32 s0, 0, s3
+; GFX8-NEXT:    s_sub_i32 s0, 0, s11
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
@@ -577,29 +576,29 @@ define amdgpu_kernel void @udivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s8, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s2
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s10
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
-; GFX8-NEXT:    v_mul_lo_u32 v4, v1, s3
+; GFX8-NEXT:    v_mul_lo_u32 v4, v1, s11
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s8, v2
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s2, v2
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s10, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s2, v2
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s10, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s9, v4
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s5
@@ -611,26 +610,24 @@ define amdgpu_kernel void @udivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ;
 ; GFX9-LABEL: udivrem_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x18
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT:    s_sub_i32 s0, 0, s2
-; GFX9-NEXT:    s_sub_i32 s1, 0, s3
+; GFX9-NEXT:    s_sub_i32 s6, 0, s2
+; GFX9-NEXT:    s_sub_i32 s7, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s1, v1
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v1
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s1, v1
@@ -652,41 +649,40 @@ define amdgpu_kernel void @udivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s2, v2
+; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x18
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX10-NEXT:    s_sub_i32 s0, 0, s2
-; GFX10-NEXT:    s_sub_i32 s1, 0, s3
+; GFX10-NEXT:    s_sub_i32 s6, 0, s2
+; GFX10-NEXT:    s_sub_i32 s7, 0, s3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s1, v1
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX10-NEXT:    v_mul_lo_u32 v2, s6, v0
+; GFX10-NEXT:    v_mul_lo_u32 v3, s7, v1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s2
@@ -713,6 +709,7 @@ define amdgpu_kernel void @udivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[4:5]
 ; GFX10-NEXT:    global_store_dwordx2 v8, v[2:3], s[6:7]
 ; GFX10-NEXT:    s_endpgm
@@ -726,15 +723,14 @@ define amdgpu_kernel void @udivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %x, <4 x i32> %y) {
 ; GFX8-LABEL: udivrem_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
-; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x10
+; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x4f7ffffe
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX8-NEXT:    s_sub_i32 s0, 0, s8
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v6, s10
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s13
+; GFX8-NEXT:    s_sub_i32 s0, 0, s12
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v6, s14
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -742,78 +738,78 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s0, v0
-; GFX8-NEXT:    s_sub_i32 s0, 0, s9
+; GFX8-NEXT:    s_sub_i32 s0, 0, s13
 ; GFX8-NEXT:    v_mul_lo_u32 v4, s0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v0, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
-; GFX8-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX8-NEXT:    v_mul_hi_u32 v0, s8, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
-; GFX8-NEXT:    v_mul_hi_u32 v1, s13, v1
-; GFX8-NEXT:    v_mul_lo_u32 v3, v0, s8
+; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
+; GFX8-NEXT:    v_mul_lo_u32 v3, v0, s12
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
-; GFX8-NEXT:    v_mul_lo_u32 v5, v1, s9
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s12, v3
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX8-NEXT:    v_mul_lo_u32 v5, v1, s13
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s8, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s8, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s12, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s8, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s12, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s13, v5
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s9, v5
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v5, v6
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
 ; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s9, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s13, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
-; GFX8-NEXT:    s_sub_i32 s0, 0, s10
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
+; GFX8-NEXT:    s_sub_i32 s0, 0, s14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v6, s11
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v6, s15
 ; GFX8-NEXT:    v_mul_lo_u32 v7, s0, v5
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GFX8-NEXT:    v_mul_f32_e32 v2, v6, v2
 ; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v7
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX8-NEXT:    v_mul_hi_u32 v7, s14, v5
-; GFX8-NEXT:    v_subrev_u32_e64 v5, s[0:1], s9, v3
-; GFX8-NEXT:    s_sub_i32 s0, 0, s11
+; GFX8-NEXT:    v_mul_hi_u32 v7, s10, v5
+; GFX8-NEXT:    v_subrev_u32_e64 v5, s[0:1], s13, v3
+; GFX8-NEXT:    s_sub_i32 s0, 0, s15
 ; GFX8-NEXT:    v_mul_lo_u32 v6, s0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v5, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v3, v7, s10
+; GFX8-NEXT:    v_mul_lo_u32 v3, v7, s14
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v7
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v2, v6
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s14, v3
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s10, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s10, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s14, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT:    v_mul_hi_u32 v8, s15, v2
+; GFX8-NEXT:    v_mul_hi_u32 v8, s11, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v7
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v7, v8, s11
-; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s10, v3
+; GFX8-NEXT:    v_mul_lo_u32 v7, v8, s15
+; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s14, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v6, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s15, v7
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s11, v7
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 1, v8
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s15, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s11, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s15, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v3, v8, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v7
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v8
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s15, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v7, s[0:1], s11, v8
+; GFX8-NEXT:    v_subrev_u32_e64 v7, s[0:1], s15, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s4
@@ -826,120 +822,118 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ;
 ; GFX9-LABEL: udivrem_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
-; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4f7ffffe
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
-; GFX9-NEXT:    s_sub_i32 s6, 0, s0
-; GFX9-NEXT:    s_sub_i32 s7, 0, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
+; GFX9-NEXT:    s_sub_i32 s0, 0, s12
+; GFX9-NEXT:    s_sub_i32 s1, 0, s13
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s14
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v1
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s1, v1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
+; GFX9-NEXT:    s_sub_i32 s4, 0, s14
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v2
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s0
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s12
 ; GFX9-NEXT:    v_add_u32_e32 v6, 1, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s13
+; GFX9-NEXT:    v_add_u32_e32 v7, 1, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v4, s8, v4
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s0, v4
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s12, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v6, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v5, s9, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s0, v4
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s12, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s3
-; GFX9-NEXT:    s_sub_i32 s0, 0, s2
-; GFX9-NEXT:    v_mul_lo_u32 v7, s0, v3
-; GFX9-NEXT:    s_sub_i32 s0, 0, s3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GFX9-NEXT:    v_subrev_u32_e32 v8, s1, v5
-; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v7
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s15
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v8, s13, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v2, v6, v2
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
-; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v7
+; GFX9-NEXT:    v_mul_lo_u32 v6, s4, v3
+; GFX9-NEXT:    s_sub_i32 s4, 0, s15
 ; GFX9-NEXT:    v_add_u32_e32 v8, 1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v7, s0, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v5
-; GFX9-NEXT:    v_mul_lo_u32 v6, v3, s2
+; GFX9-NEXT:    v_mul_f32_e32 v2, v7, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, s4, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
+; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v8, s13, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v7
-; GFX9-NEXT:    v_subrev_u32_e32 v8, s1, v5
-; GFX9-NEXT:    v_sub_u32_e32 v6, s10, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v3, s14
+; GFX9-NEXT:    v_add_u32_e32 v8, 1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v7
 ; GFX9-NEXT:    v_mul_hi_u32 v7, s11, v2
-; GFX9-NEXT:    v_add_u32_e32 v8, 1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v6
+; GFX9-NEXT:    v_sub_u32_e32 v6, s10, v6
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v6
-; GFX9-NEXT:    v_mul_lo_u32 v8, v7, s3
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s14, v6
+; GFX9-NEXT:    v_mul_lo_u32 v8, v7, s15
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v2, 1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v6
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v6
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s14, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v3, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s11, v8
 ; GFX9-NEXT:    v_add_u32_e32 v8, 1, v7
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v8, s3, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v8, s15, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v3, v8, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v7
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v8
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s3, v8
+; GFX9-NEXT:    v_subrev_u32_e32 v7, s15, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
-; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
+; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0x4f7ffffe
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s10
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s11
-; GFX10-NEXT:    s_sub_i32 s6, 0, s8
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s13
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s14
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s15
+; GFX10-NEXT:    s_sub_i32 s0, 0, s12
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX10-NEXT:    s_sub_i32 s7, 0, s9
-; GFX10-NEXT:    s_sub_i32 s12, 0, s10
+; GFX10-NEXT:    s_sub_i32 s1, 0, s13
+; GFX10-NEXT:    s_sub_i32 s2, 0, s14
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v4
@@ -948,12 +942,11 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX10-NEXT:    v_mul_lo_u32 v4, s6, v0
-; GFX10-NEXT:    v_mul_lo_u32 v5, s7, v1
-; GFX10-NEXT:    v_mul_lo_u32 v6, s12, v2
-; GFX10-NEXT:    s_sub_i32 s6, 0, s11
-; GFX10-NEXT:    v_mul_lo_u32 v7, s6, v3
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    v_mul_lo_u32 v4, s0, v0
+; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v1
+; GFX10-NEXT:    v_mul_lo_u32 v6, s2, v2
+; GFX10-NEXT:    s_sub_i32 s0, 0, s15
+; GFX10-NEXT:    v_mul_lo_u32 v7, s0, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v4, v0, v4
 ; GFX10-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v2, v6
@@ -962,34 +955,34 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v6
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v7
-; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX10-NEXT:    v_mul_hi_u32 v2, s2, v2
-; GFX10-NEXT:    v_mul_hi_u32 v3, s3, v3
-; GFX10-NEXT:    v_mul_lo_u32 v4, v0, s8
-; GFX10-NEXT:    v_mul_lo_u32 v5, v1, s9
-; GFX10-NEXT:    v_mul_lo_u32 v6, v2, s10
+; GFX10-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX10-NEXT:    v_mul_hi_u32 v1, s9, v1
+; GFX10-NEXT:    v_mul_hi_u32 v2, s10, v2
+; GFX10-NEXT:    v_mul_hi_u32 v3, s11, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, v0, s12
+; GFX10-NEXT:    v_mul_lo_u32 v5, v1, s13
+; GFX10-NEXT:    v_mul_lo_u32 v6, v2, s14
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v0
-; GFX10-NEXT:    v_mul_lo_u32 v7, v3, s11
+; GFX10-NEXT:    v_mul_lo_u32 v7, v3, s15
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v12, 1, v3
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s0, v4
-; GFX10-NEXT:    v_sub_nc_u32_e32 v5, s1, v5
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s2, v6
-; GFX10-NEXT:    v_sub_nc_u32_e32 v7, s3, v7
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s8, v4
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v5
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s10, v6
-; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s11, v7
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s8, v4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v5, s9, v5
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s10, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v7, s11, v7
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s12, v4
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s13, v5
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s14, v6
+; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s15, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s8, v4
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s12, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s9, v5
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s13, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v11, s10, v6
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v11, s14, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v12, s2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v12, s11, v7
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v12, s15, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v10, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s1
@@ -997,24 +990,23 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v12, s2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s8, v4
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v5
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s10, v6
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s12, v4
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s13, v5
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s14, v6
 ; GFX10-NEXT:    v_add_nc_u32_e32 v12, 1, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s11, v7
+; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s15, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s8, v4
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s12, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s9, v5
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s13, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v11, s10, v6
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v13, s11, v7
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v11, s14, v6
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v13, s15, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v12, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v10, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s2
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
 ; GFX10-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX10-NEXT:    s_endpgm
@@ -1028,20 +1020,19 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64> addrspace(1)* %out1, <2 x i64> %x, <2 x i64> %y) {
 ; GFX8-LABEL: udivrem_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
-; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x10
+; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX8-NEXT:    s_sub_u32 s0, 0, s8
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s13
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s12
+; GFX8-NEXT:    s_sub_u32 s0, 0, s12
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_subb_u32 s1, 0, s9
+; GFX8-NEXT:    s_subb_u32 s1, 0, s13
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
@@ -1049,7 +1040,7 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_sub_u32 s2, 0, s10
+; GFX8-NEXT:    s_sub_u32 s2, 0, s14
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v0
@@ -1059,15 +1050,15 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v0, v4
+; GFX8-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v6, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v0, v2
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -1083,7 +1074,7 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v4, s0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s9
+; GFX8-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
@@ -1109,64 +1100,64 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v2, s13, v0
-; GFX8-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX8-NEXT:    v_mul_hi_u32 v5, s12, v0
-; GFX8-NEXT:    v_mul_hi_u32 v0, s13, v0
-; GFX8-NEXT:    v_mov_b32_e32 v4, s13
+; GFX8-NEXT:    v_mul_lo_u32 v2, s9, v0
+; GFX8-NEXT:    v_mul_lo_u32 v3, s8, v1
+; GFX8-NEXT:    v_mul_hi_u32 v5, s8, v0
+; GFX8-NEXT:    v_mul_hi_u32 v0, s9, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v5, s13, v1
+; GFX8-NEXT:    v_mul_lo_u32 v5, s9, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT:    v_mul_hi_u32 v3, s12, v1
+; GFX8-NEXT:    v_mul_hi_u32 v3, s8, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT:    v_mul_hi_u32 v1, s13, v1
+; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX8-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX8-NEXT:    v_mul_hi_u32 v7, s8, v0
-; GFX8-NEXT:    v_mul_lo_u32 v5, s8, v0
+; GFX8-NEXT:    v_mul_lo_u32 v2, s13, v0
+; GFX8-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX8-NEXT:    v_mul_hi_u32 v7, s12, v0
+; GFX8-NEXT:    v_mul_lo_u32 v5, s12, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s12, v5
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s8, v5
 ; GFX8-NEXT:    v_subb_u32_e64 v5, s[0:1], v4, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v2, s[0:1], s13, v2
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v5
+; GFX8-NEXT:    v_sub_u32_e64 v2, s[0:1], s9, v2
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v5
 ; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s8, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s12, v3
 ; GFX8-NEXT:    v_subbrev_u32_e64 v8, s[0:1], 0, v2, vcc
 ; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], 1, v0
 ; GFX8-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
 ; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v8
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s8, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v8
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s12, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v9
 ; GFX8-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v13, s11
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v13, s15
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v12, s10
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v12, s14
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
@@ -1186,7 +1177,7 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX8-NEXT:    s_and_b32 s0, s0, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX8-NEXT:    s_subb_u32 s3, 0, s11
+; GFX8-NEXT:    s_subb_u32 s3, 0, s15
 ; GFX8-NEXT:    v_mul_lo_u32 v7, s3, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v8, s2, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
@@ -1222,7 +1213,7 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v7, s2, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v9, s2, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v8, s2, v2
-; GFX8-NEXT:    v_mov_b32_e32 v10, s11
+; GFX8-NEXT:    v_mov_b32_e32 v10, s15
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v3, v8
@@ -1248,51 +1239,51 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v6, s15, v2
-; GFX8-NEXT:    v_mul_lo_u32 v7, s14, v3
-; GFX8-NEXT:    v_mul_hi_u32 v9, s14, v2
-; GFX8-NEXT:    v_mul_hi_u32 v2, s15, v2
-; GFX8-NEXT:    v_mov_b32_e32 v8, s15
+; GFX8-NEXT:    v_mul_lo_u32 v6, s11, v2
+; GFX8-NEXT:    v_mul_lo_u32 v7, s10, v3
+; GFX8-NEXT:    v_mul_hi_u32 v9, s10, v2
+; GFX8-NEXT:    v_mul_hi_u32 v2, s11, v2
+; GFX8-NEXT:    v_mov_b32_e32 v8, s11
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v9, s15, v3
+; GFX8-NEXT:    v_mul_lo_u32 v9, s11, v3
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
-; GFX8-NEXT:    v_mul_hi_u32 v7, s14, v3
+; GFX8-NEXT:    v_mul_hi_u32 v7, s10, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
-; GFX8-NEXT:    v_mul_hi_u32 v3, s15, v3
+; GFX8-NEXT:    v_mul_hi_u32 v3, s11, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
-; GFX8-NEXT:    v_mul_lo_u32 v6, s11, v2
-; GFX8-NEXT:    v_mul_lo_u32 v7, s10, v3
-; GFX8-NEXT:    v_mul_hi_u32 v11, s10, v2
-; GFX8-NEXT:    v_mul_lo_u32 v9, s10, v2
+; GFX8-NEXT:    v_mul_lo_u32 v6, s15, v2
+; GFX8-NEXT:    v_mul_lo_u32 v7, s14, v3
+; GFX8-NEXT:    v_mul_hi_u32 v11, s14, v2
+; GFX8-NEXT:    v_mul_lo_u32 v9, s14, v2
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v11
-; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s14, v9
+; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s10, v9
 ; GFX8-NEXT:    v_subb_u32_e64 v8, s[0:1], v8, v6, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s15, v6
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v8
+; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s11, v6
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v7
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v8
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v8
 ; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v10, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[0:1]
-; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, s10, v7
+; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, s14, v7
 ; GFX8-NEXT:    v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v12
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v11
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v12
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v14, s[0:1], 1, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v10, vcc
@@ -1300,7 +1291,7 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 1, v14
 ; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v15, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX8-NEXT:    v_subrev_u32_e64 v13, s[0:1], s10, v11
+; GFX8-NEXT:    v_subrev_u32_e64 v13, s[0:1], s14, v11
 ; GFX8-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v14, v10, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
@@ -1322,19 +1313,18 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ;
 ; GFX9-LABEL: udivrem_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
-; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x10
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX9-NEXT:    s_sub_u32 s0, 0, s8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s13
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s12
+; GFX9-NEXT:    s_sub_u32 s0, 0, s12
 ; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX9-NEXT:    s_subb_u32 s1, 0, s9
+; GFX9-NEXT:    s_subb_u32 s1, 0, s13
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
@@ -1342,8 +1332,8 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v14, s11
-; GFX9-NEXT:    s_sub_u32 s2, 0, s10
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v14, s15
+; GFX9-NEXT:    s_sub_u32 s2, 0, s14
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s1, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
@@ -1377,7 +1367,7 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v3, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -1401,19 +1391,19 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s13, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s13, v0
-; GFX9-NEXT:    v_mov_b32_e32 v5, s9
+; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, s13
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s13, v1
+; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, s12, v1
-; GFX9-NEXT:    v_mul_hi_u32 v1, s13, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v1
+; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
@@ -1422,35 +1412,35 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v1, v3, v2, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
-; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT:    v_mul_lo_u32 v6, s12, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v3, v4
-; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s12, v6
+; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s8, v6
 ; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v7, v2, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v6
-; GFX9-NEXT:    v_sub_u32_e32 v2, s13, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v6
+; GFX9-NEXT:    v_sub_u32_e32 v2, s9, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v6
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s8, v3
+; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s12, v3
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[0:1], 0, v2, vcc
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s14
 ; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], 1, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1]
 ; GFX9-NEXT:    v_add_f32_e32 v5, v14, v5
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v8
-; GFX9-NEXT:    v_subrev_co_u32_e32 v15, vcc, s8, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v8
+; GFX9-NEXT:    v_subrev_co_u32_e32 v15, vcc, s12, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_add_co_u32_e64 v12, s[0:1], 1, v9
@@ -1468,7 +1458,7 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    s_and_b32 s0, s0, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-NEXT:    s_subb_u32 s3, 0, s11
+; GFX9-NEXT:    s_subb_u32 s3, 0, s15
 ; GFX9-NEXT:    v_mul_lo_u32 v13, s3, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v14, s2, v12
 ; GFX9-NEXT:    v_mul_hi_u32 v16, s2, v5
@@ -1530,19 +1520,19 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add3_u32 v7, v10, v7, v8
 ; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], v5, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], v9, v7, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v8, s15, v3
-; GFX9-NEXT:    v_mul_lo_u32 v9, s14, v7
+; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v3
+; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v2, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v2, s14, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, s15, v3
+; GFX9-NEXT:    v_mul_hi_u32 v2, s10, v3
+; GFX9-NEXT:    v_mul_hi_u32 v3, s11, v3
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, s15, v7
+; GFX9-NEXT:    v_mul_lo_u32 v6, s11, v7
 ; GFX9-NEXT:    v_add_u32_e32 v2, v8, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, s14, v7
-; GFX9-NEXT:    v_mul_hi_u32 v7, s15, v7
+; GFX9-NEXT:    v_mul_hi_u32 v8, s10, v7
+; GFX9-NEXT:    v_mul_hi_u32 v7, s11, v7
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
@@ -1551,30 +1541,30 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v3, v6, v3, v7
-; GFX9-NEXT:    v_mul_lo_u32 v6, s11, v2
-; GFX9-NEXT:    v_mul_lo_u32 v7, s10, v3
-; GFX9-NEXT:    v_mul_hi_u32 v8, s10, v2
-; GFX9-NEXT:    v_mul_lo_u32 v10, s10, v2
-; GFX9-NEXT:    v_mov_b32_e32 v11, s15
-; GFX9-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-NEXT:    v_mul_lo_u32 v6, s15, v2
+; GFX9-NEXT:    v_mul_lo_u32 v7, s14, v3
+; GFX9-NEXT:    v_mul_hi_u32 v8, s14, v2
+; GFX9-NEXT:    v_mul_lo_u32 v10, s14, v2
+; GFX9-NEXT:    v_mov_b32_e32 v11, s11
+; GFX9-NEXT:    v_mov_b32_e32 v9, s15
 ; GFX9-NEXT:    v_add3_u32 v6, v6, v7, v8
-; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, s14, v10
+; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, s10, v10
 ; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[0:1], v11, v6, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v8
-; GFX9-NEXT:    v_sub_u32_e32 v6, s15, v6
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v8
+; GFX9-NEXT:    v_sub_u32_e32 v6, s11, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v7
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s10, v7
+; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s14, v7
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v12
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v11
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v12
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v14, s[0:1], 1, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v9, vcc
@@ -1584,7 +1574,7 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v14, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v15, s[0:1], s10, v11
+; GFX9-NEXT:    v_subrev_co_u32_e64 v15, s[0:1], s14, v11
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v10
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 0
@@ -1601,28 +1591,28 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ;
 ; GFX10-LABEL: udivrem_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s11
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s10
-; GFX10-NEXT:    s_sub_u32 s6, 0, s8
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s13
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s15
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s12
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s14
+; GFX10-NEXT:    s_sub_u32 s0, 0, s12
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
-; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    s_and_b32 s0, s0, 1
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_add_f32_e32 v1, v2, v3
-; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX10-NEXT:    s_subb_u32 s7, 0, s9
+; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10-NEXT:    s_subb_u32 s1, 0, s13
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX10-NEXT:    s_sub_u32 s12, 0, s10
-; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    s_and_b32 s0, s0, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX10-NEXT:    s_subb_u32 s13, 0, s11
+; GFX10-NEXT:    s_sub_u32 s2, 0, s14
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    s_and_b32 s3, s3, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10-NEXT:    s_subb_u32 s3, 0, s15
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
@@ -1635,16 +1625,16 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX10-NEXT:    v_add_f32_e32 v0, v4, v0
 ; GFX10-NEXT:    v_add_f32_e32 v1, v5, v1
-; GFX10-NEXT:    v_mul_lo_u32 v4, s6, v2
-; GFX10-NEXT:    v_mul_lo_u32 v8, s12, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, s0, v2
+; GFX10-NEXT:    v_mul_lo_u32 v8, s2, v3
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_lo_u32 v5, s7, v0
-; GFX10-NEXT:    v_mul_hi_u32 v6, s6, v0
-; GFX10-NEXT:    v_mul_lo_u32 v9, s13, v1
-; GFX10-NEXT:    v_mul_hi_u32 v10, s12, v1
-; GFX10-NEXT:    v_mul_lo_u32 v7, s6, v0
-; GFX10-NEXT:    v_mul_lo_u32 v11, s12, v1
+; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v0
+; GFX10-NEXT:    v_mul_hi_u32 v6, s0, v0
+; GFX10-NEXT:    v_mul_lo_u32 v9, s3, v1
+; GFX10-NEXT:    v_mul_hi_u32 v10, s2, v1
+; GFX10-NEXT:    v_mul_lo_u32 v7, s0, v0
+; GFX10-NEXT:    v_mul_lo_u32 v11, s2, v1
 ; GFX10-NEXT:    v_add3_u32 v4, v5, v4, v6
 ; GFX10-NEXT:    v_add3_u32 v8, v9, v8, v10
 ; GFX10-NEXT:    v_mul_lo_u32 v5, v2, v7
@@ -1657,49 +1647,49 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_mul_hi_u32 v10, v1, v11
 ; GFX10-NEXT:    v_mul_hi_u32 v11, v3, v11
 ; GFX10-NEXT:    v_mul_lo_u32 v16, v3, v8
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v12
+; GFX10-NEXT:    v_add_co_u32 v5, s6, v5, v12
 ; GFX10-NEXT:    v_mul_hi_u32 v14, v0, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v7, s0, v13, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v9, s0, v9, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v11, s0, v16, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v7, s6, v13, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v9, s6, v9, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v11, s6, v16, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v5, s6, v5, v6
 ; GFX10-NEXT:    v_mul_hi_u32 v17, v1, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v7, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v9, s0, v9, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v6, s6, v7, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v9, s6, v9, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s6
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v12, v5
-; GFX10-NEXT:    v_add_co_u32 v10, s0, v11, v17
+; GFX10-NEXT:    v_add_co_u32 v10, s6, v11, v17
 ; GFX10-NEXT:    v_mul_hi_u32 v4, v2, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, v15, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v6, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v5, s6, v6, v5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v13, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s6
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v8
-; GFX10-NEXT:    v_add_co_u32 v9, s0, v10, v9
+; GFX10-NEXT:    v_add_co_u32 v9, s6, v10, v9
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, v16, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s6
 ; GFX10-NEXT:    v_add3_u32 v4, v7, v6, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_add3_u32 v5, v11, v10, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v6, s7, v0
-; GFX10-NEXT:    v_mul_hi_u32 v7, s6, v0
-; GFX10-NEXT:    v_mul_lo_u32 v5, s6, v2
-; GFX10-NEXT:    v_mul_lo_u32 v9, s13, v1
-; GFX10-NEXT:    v_mul_hi_u32 v10, s12, v1
-; GFX10-NEXT:    v_mul_lo_u32 v11, s12, v3
-; GFX10-NEXT:    v_mul_lo_u32 v4, s6, v0
-; GFX10-NEXT:    v_mul_lo_u32 v8, s12, v1
+; GFX10-NEXT:    v_mul_lo_u32 v6, s1, v0
+; GFX10-NEXT:    v_mul_hi_u32 v7, s0, v0
+; GFX10-NEXT:    v_mul_lo_u32 v5, s0, v2
+; GFX10-NEXT:    v_mul_lo_u32 v9, s3, v1
+; GFX10-NEXT:    v_mul_hi_u32 v10, s2, v1
+; GFX10-NEXT:    v_mul_lo_u32 v11, s2, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, s0, v0
+; GFX10-NEXT:    v_mul_lo_u32 v8, s2, v1
 ; GFX10-NEXT:    v_add3_u32 v5, v6, v5, v7
 ; GFX10-NEXT:    v_add3_u32 v9, v9, v11, v10
 ; GFX10-NEXT:    v_mul_lo_u32 v12, v2, v4
@@ -1711,168 +1701,166 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_mul_lo_u32 v15, v1, v9
 ; GFX10-NEXT:    v_mul_hi_u32 v7, v1, v8
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v8
-; GFX10-NEXT:    v_add_co_u32 v10, s6, v12, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v4, s6, v11, v4
+; GFX10-NEXT:    v_add_co_u32 v10, s0, v12, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v4, s0, v11, v4
 ; GFX10-NEXT:    v_mul_lo_u32 v16, v3, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v6, s6, v6, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v15
 ; GFX10-NEXT:    v_mul_hi_u32 v14, v0, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v10, s6, v10, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v8, s6, v16, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v4, s6, v4, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v10, s0, v10, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v8, s0, v16, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v14
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, v12, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v6, s6, v6, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v7
 ; GFX10-NEXT:    v_mul_hi_u32 v5, v2, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v4, s6, v4, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v10
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v11, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
 ; GFX10-NEXT:    v_mul_hi_u32 v17, v1, v9
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, v15, v6
 ; GFX10-NEXT:    v_add3_u32 v5, v7, v10, v5
 ; GFX10-NEXT:    v_mul_hi_u32 v9, v3, v9
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0
-; GFX10-NEXT:    v_add_co_u32 v8, s6, v8, v17
+; GFX10-NEXT:    v_mul_hi_u32 v10, s9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, 0
+; GFX10-NEXT:    v_add_co_u32 v8, s0, v8, v17
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v4, s12, v8, v6
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mul_lo_u32 v6, s1, v0
-; GFX10-NEXT:    v_mul_lo_u32 v8, s0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v4, s0, v8, v6
+; GFX10-NEXT:    v_mul_lo_u32 v6, s9, v0
+; GFX10-NEXT:    v_mul_lo_u32 v8, s8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v0, s8, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v13, v11
-; GFX10-NEXT:    v_mul_hi_u32 v11, s1, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s12
-; GFX10-NEXT:    v_mul_lo_u32 v12, s1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT:    v_mul_lo_u32 v11, s9, v2
 ; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v4
-; GFX10-NEXT:    v_add_co_u32 v6, s12, v6, v8
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v8
 ; GFX10-NEXT:    v_add3_u32 v5, v7, v5, v9
-; GFX10-NEXT:    v_mul_hi_u32 v7, s0, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s12
-; GFX10-NEXT:    v_add_co_u32 v0, s13, v6, v0
-; GFX10-NEXT:    v_add_co_u32 v9, s12, v12, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s13
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s12
-; GFX10-NEXT:    v_mul_hi_u32 v2, s1, v2
-; GFX10-NEXT:    v_add_co_u32 v7, s12, v9, v7
+; GFX10-NEXT:    v_mul_hi_u32 v7, s8, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v0, s1, v6, v0
+; GFX10-NEXT:    v_add_co_u32 v9, s0, v11, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    v_mul_hi_u32 v2, s9, v2
+; GFX10-NEXT:    v_add_co_u32 v7, s0, v9, v7
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v8, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX10-NEXT:    v_add_co_u32 v0, s12, v7, v0
+; GFX10-NEXT:    v_add_co_u32 v0, s0, v7, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, v6, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s12
-; GFX10-NEXT:    v_mul_lo_u32 v6, s3, v1
-; GFX10-NEXT:    v_mul_lo_u32 v8, s2, v3
-; GFX10-NEXT:    v_mul_lo_u32 v9, s9, v0
-; GFX10-NEXT:    v_mul_hi_u32 v11, s8, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT:    v_mul_lo_u32 v6, s11, v1
+; GFX10-NEXT:    v_mul_lo_u32 v8, s10, v3
+; GFX10-NEXT:    v_mul_lo_u32 v9, s13, v0
+; GFX10-NEXT:    v_mul_hi_u32 v10, s12, v0
 ; GFX10-NEXT:    v_add3_u32 v2, v4, v5, v2
-; GFX10-NEXT:    v_mul_hi_u32 v7, s2, v1
-; GFX10-NEXT:    v_mul_lo_u32 v13, s8, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, s3, v1
-; GFX10-NEXT:    v_mul_lo_u32 v4, s3, v3
-; GFX10-NEXT:    v_mul_lo_u32 v12, s8, v2
-; GFX10-NEXT:    v_add_co_u32 v6, s12, v6, v8
-; GFX10-NEXT:    v_mul_hi_u32 v5, s2, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s12
-; GFX10-NEXT:    v_mul_hi_u32 v3, s3, v3
-; GFX10-NEXT:    v_add_co_u32 v1, s12, v4, v1
-; GFX10-NEXT:    v_add3_u32 v9, v9, v12, v11
-; GFX10-NEXT:    v_sub_co_u32 v11, vcc_lo, s0, v13
+; GFX10-NEXT:    v_mul_hi_u32 v7, s10, v1
+; GFX10-NEXT:    v_mul_hi_u32 v1, s11, v1
+; GFX10-NEXT:    v_mul_lo_u32 v4, s11, v3
+; GFX10-NEXT:    v_mul_lo_u32 v13, s12, v0
+; GFX10-NEXT:    v_mul_lo_u32 v11, s12, v2
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
+; GFX10-NEXT:    v_mul_hi_u32 v5, s10, v3
+; GFX10-NEXT:    v_mul_hi_u32 v3, s11, v3
+; GFX10-NEXT:    v_add_co_u32 v1, s0, v4, v1
+; GFX10-NEXT:    v_add3_u32 v9, v9, v11, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10-NEXT:    v_sub_co_u32 v10, vcc_lo, s8, v13
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v7
-; GFX10-NEXT:    v_sub_nc_u32_e32 v7, s1, v9
+; GFX10-NEXT:    v_sub_nc_u32_e32 v7, s9, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v9, s0, s1, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v11
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s9, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v9
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v9, s0, s9, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s12, v10
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s13, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s13, v9
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, v8, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, v11, s8
+; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, v10, s12
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v15, s0, 0, v7, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v9
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s9, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v9
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s13, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, v4, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, v13, v12, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v13, v11, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s13, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v14
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s12, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s0
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v1, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v0, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v16, s0, 0, v2, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v15
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v15
 ; GFX10-NEXT:    v_add3_u32 v3, v4, v1, v3
-; GFX10-NEXT:    v_mul_hi_u32 v18, s10, v6
+; GFX10-NEXT:    v_mul_hi_u32 v18, s14, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v13, v8, s0
-; GFX10-NEXT:    v_mul_lo_u32 v13, s11, v6
-; GFX10-NEXT:    v_mul_lo_u32 v17, s10, v3
+; GFX10-NEXT:    v_mul_lo_u32 v13, s15, v6
+; GFX10-NEXT:    v_mul_lo_u32 v17, s14, v3
 ; GFX10-NEXT:    v_add_co_u32 v1, s0, v5, 1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, 0, v16, s0
-; GFX10-NEXT:    v_sub_co_u32 v19, s0, v14, s8
+; GFX10-NEXT:    v_sub_co_u32 v19, s0, v14, s12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v5, s10, v6
+; GFX10-NEXT:    v_mul_lo_u32 v5, s14, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v13, v13, v17, v18
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s3, v13
-; GFX10-NEXT:    v_sub_co_u32 v12, s0, s2, v5
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v16, s1, s3, v13, s0
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s0, s11, v2, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v12
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s11, v13
+; GFX10-NEXT:    v_sub_co_u32 v11, s0, s10, v5
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v16, s1, s11, v13, s0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v11
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v8
-; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s11, v16
+; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s15, v16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s0
-; GFX10-NEXT:    v_sub_co_u32 v13, s0, v12, s10
+; GFX10-NEXT:    v_sub_co_u32 v13, s0, v11, s14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v14, v19, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v14, s2, 0, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s11, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v11, v4, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s0, s11, v2, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s15, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s11, v14
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s15, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s10, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s1
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s14, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s1
 ; GFX10-NEXT:    v_add_co_u32 v15, s1, v6, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s1, 0, v3, s1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s11, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s1
-; GFX10-NEXT:    v_add_co_u32 v11, s1, v15, 1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s15, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s1
+; GFX10-NEXT:    v_add_co_u32 v10, s1, v15, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v18, s1, 0, v17, s1
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v8
-; GFX10-NEXT:    v_sub_co_u32 v8, s1, v13, s10
+; GFX10-NEXT:    v_sub_co_u32 v8, s1, v13, s14
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s1, 0, v2, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v15, v11, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v15, v10, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v15, v17, v18, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v13, v8, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, v14, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v11, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v10, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v12, v8, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v11, v8, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v16, v13, s1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
-; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX10-NEXT:    global_store_dwordx4 v12, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v12, v[4:7], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv <2 x i64> %x, %y
   store <2 x i64> %div, <2 x i64> addrspace(1)* %out0
@@ -2307,57 +2295,55 @@ define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out0, i16 addrspace(1)* %
 define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %x, <2 x i16> %y) {
 ; GFX8-LABEL: udivrem_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x14
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
 ; GFX8-NEXT:    s_mov_b32 s2, 0xffff
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s6, s0, s2
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX8-NEXT:    s_sub_i32 s1, 0, s6
+; GFX8-NEXT:    s_and_b32 s3, s1, s2
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX8-NEXT:    s_lshr_b32 s8, s1, 16
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s8
+; GFX8-NEXT:    s_sub_i32 s1, 0, s3
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x10
+; GFX8-NEXT:    s_lshr_b32 s9, s0, 16
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX8-NEXT:    s_and_b32 s0, s0, s2
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s1, v0
-; GFX8-NEXT:    s_sub_i32 s1, 0, s3
-; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_sub_i32 s1, 0, s8
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
-; GFX8-NEXT:    v_mul_hi_u32 v1, s4, v1
-; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s6
+; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s3
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s6, v2
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s3
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s6, v2
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s8
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s9, v3
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s8, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s8, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, s2, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2365,36 +2351,34 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-NEXT:    v_and_b32_e32 v0, s2, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_dword v[0:1], v4
-; GFX8-NEXT:    v_mov_b32_e32 v0, s10
-; GFX8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: udivrem_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x14
-; GFX9-NEXT:    s_mov_b32 s1, 0xffff
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s7, s0, s1
+; GFX9-NEXT:    s_and_b32 s7, s1, s2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT:    s_sub_i32 s2, 0, s7
+; GFX9-NEXT:    s_sub_i32 s1, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x10
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    s_sub_i32 s3, 0, s6
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX9-NEXT:    s_and_b32 s9, s0, s2
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s9, s0, s1
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX9-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX9-NEXT:    v_mul_lo_u32 v2, s1, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
@@ -2440,15 +2424,14 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ;
 ; GFX10-LABEL: udivrem_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x14
-; GFX10-NEXT:    s_mov_b32 s2, 0xffff
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX10-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX10-NEXT:    s_and_b32 s3, s0, s2
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX10-NEXT:    s_sub_i32 s6, 0, s1
-; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x10
+; GFX10-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX10-NEXT:    s_and_b32 s1, s1, s3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX10-NEXT:    s_sub_i32 s6, 0, s2
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -2456,38 +2439,37 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s6, v0
-; GFX10-NEXT:    s_sub_i32 s6, 0, s3
+; GFX10-NEXT:    s_sub_i32 s6, 0, s1
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s6, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX10-NEXT:    s_and_b32 s0, s0, s2
+; GFX10-NEXT:    s_and_b32 s0, s0, s3
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s0, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s1
+; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s3
+; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s6, v2
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s1, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s2, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s1, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s1, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s1, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index e7a0847383b8e..741602f8f0338 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -2528,18 +2528,16 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s9, s6, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX6-NEXT:    s_lshr_b32 s9, s4, 16
-; GFX6-NEXT:    s_and_b32 s4, s4, s8
+; GFX6-NEXT:    s_and_b32 s2, s6, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    s_and_b32 s9, s4, s8
 ; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s9
-; GFX6-NEXT:    s_and_b32 s6, s7, s8
+; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
@@ -2548,36 +2546,38 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX6-NEXT:    s_and_b32 s4, s7, s8
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GFX6-NEXT:    s_lshr_b32 s4, s5, 16
-; GFX6-NEXT:    s_lshr_b32 s10, s7, 16
-; GFX6-NEXT:    s_and_b32 s5, s5, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
+; GFX6-NEXT:    s_and_b32 s4, s5, s8
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s5
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mad_f32 v5, -v1, v4, v5
+; GFX6-NEXT:    s_lshr_b32 s4, s7, 16
+; GFX6-NEXT:    v_mad_f32 v3, -v1, v4, v5
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
+; GFX6-NEXT:    s_lshr_b32 s4, s5, 16
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
-; GFX6-NEXT:    v_mul_f32_e32 v4, v6, v7
-; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_mad_f32 v4, -v4, v3, v6
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_mul_f32_e32 v3, v6, v7
+; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
+; GFX6-NEXT:    v_mad_f32 v3, -v3, v5, v6
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
 ; GFX6-NEXT:    v_and_b32_e32 v1, s8, v1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2592,49 +2592,49 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s8, s6, s0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX9-NEXT:    s_and_b32 s1, s4, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
 ; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
-; GFX9-NEXT:    s_and_b32 s4, s4, s0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
-; GFX9-NEXT:    s_lshr_b32 s4, s6, 16
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX9-NEXT:    s_and_b32 s1, s7, s0
-; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
+; GFX9-NEXT:    v_mul_f32_e32 v4, v1, v4
+; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX9-NEXT:    v_mad_f32 v1, -v4, v0, v1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v6
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
+; GFX9-NEXT:    s_and_b32 s1, s7, s0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GFX9-NEXT:    v_mad_f32 v4, -v1, v3, v5
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX9-NEXT:    s_lshr_b32 s6, s7, 16
 ; GFX9-NEXT:    s_and_b32 s0, s5, s0
+; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GFX9-NEXT:    s_lshr_b32 s8, s5, 16
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v6, v7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s8
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_mad_f32 v6, -v1, v5, v6
+; GFX9-NEXT:    s_lshr_b32 s0, s7, 16
+; GFX9-NEXT:    v_mad_f32 v4, -v1, v5, v6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
+; GFX9-NEXT:    s_lshr_b32 s0, s5, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
-; GFX9-NEXT:    v_mul_f32_e32 v5, v7, v8
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v5
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mad_f32 v5, -v5, v4, v7
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
+; GFX9-NEXT:    v_mul_f32_e32 v4, v7, v8
+; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v4
+; GFX9-NEXT:    v_mad_f32 v4, -v4, v6, v7
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v1, v5, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, v5, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
@@ -2745,65 +2745,69 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s9, s6, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX6-NEXT:    s_and_b32 s10, s4, s8
-; GFX6-NEXT:    s_lshr_b32 s11, s6, 16
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
+; GFX6-NEXT:    s_and_b32 s2, s6, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    v_mov_b32_e32 v4, s6
+; GFX6-NEXT:    v_alignbit_b32 v4, s7, v4, 16
+; GFX6-NEXT:    s_and_b32 s9, s4, s8
+; GFX6-NEXT:    v_and_b32_e32 v5, s8, v4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s11
-; GFX6-NEXT:    s_lshr_b32 s9, s4, 16
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s9
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, v5
+; GFX6-NEXT:    v_mov_b32_e32 v3, s4
+; GFX6-NEXT:    v_alignbit_b32 v3, s5, v3, 16
+; GFX6-NEXT:    v_and_b32_e32 v6, s8, v3
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, v6
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
+; GFX6-NEXT:    v_mul_f32_e32 v1, v6, v7
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v1
-; GFX6-NEXT:    v_mad_f32 v1, -v1, v3, v4
+; GFX6-NEXT:    v_mad_f32 v1, -v1, v5, v6
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v3
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v5
 ; GFX6-NEXT:    s_and_b32 s6, s7, s8
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
 ; GFX6-NEXT:    s_and_b32 s6, s5, s8
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s11
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GFX6-NEXT:    s_lshr_b32 s12, s7, 16
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s9, v1
-; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s12
-; GFX6-NEXT:    s_lshr_b32 s10, s5, 16
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s10
-; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s6
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v3
+; GFX6-NEXT:    s_lshr_b32 s4, s7, 16
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v1
+; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
+; GFX6-NEXT:    s_lshr_b32 s6, s5, 16
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s6
+; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
+; GFX6-NEXT:    v_mad_f32 v4, -v1, v2, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v6, v7
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v6
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GFX6-NEXT:    v_mad_f32 v2, -v2, v5, v6
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v5
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s12
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s4
 ; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_and_b32_e32 v1, s8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -2817,56 +2821,56 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s8, s6, s0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX9-NEXT:    s_and_b32 s9, s4, s0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX9-NEXT:    s_lshr_b32 s9, s6, 16
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s9
-; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
+; GFX9-NEXT:    s_and_b32 s1, s4, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s4
+; GFX9-NEXT:    v_mul_f32_e32 v4, v1, v4
+; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GFX9-NEXT:    v_mad_f32 v1, -v4, v0, v1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX9-NEXT:    s_lshr_b32 s10, s7, 16
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v6
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
+; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    s_and_b32 s6, s7, s0
-; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s6
+; GFX9-NEXT:    s_and_b32 s8, s7, s0
+; GFX9-NEXT:    v_mad_f32 v4, -v1, v3, v5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s8
 ; GFX9-NEXT:    s_and_b32 s0, s5, s0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s10
-; GFX9-NEXT:    s_lshr_b32 s8, s5, 16
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v6, v7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s8
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v4
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mad_f32 v6, -v3, v5, v6
+; GFX9-NEXT:    s_lshr_b32 s6, s7, 16
+; GFX9-NEXT:    v_mad_f32 v4, -v3, v5, v6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s6
+; GFX9-NEXT:    s_lshr_b32 s5, s5, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
-; GFX9-NEXT:    v_mul_f32_e32 v5, v7, v8
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v5
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
+; GFX9-NEXT:    v_sub_u32_e32 v0, s1, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mad_f32 v5, -v5, v4, v7
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s7
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s10
-; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT:    v_sub_u32_e32 v5, s1, v1
-; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v3
-; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v4
+; GFX9-NEXT:    v_mul_f32_e32 v4, v7, v8
+; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v4
+; GFX9-NEXT:    v_mad_f32 v4, -v4, v6, v7
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v6
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s8
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s6
+; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v1
+; GFX9-NEXT:    v_sub_u32_e32 v1, s0, v3
+; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, v4, v0
@@ -3274,70 +3278,75 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NEXT:    v_alignbit_b32 v2, s7, v2, 16
+; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v3
+; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 16
+; GFX6-NEXT:    v_bfe_i32 v5, v1, 0, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v5
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
+; GFX6-NEXT:    v_xor_b32_e32 v3, v5, v3
+; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
+; GFX6-NEXT:    v_mul_f32_e32 v5, v6, v7
+; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX6-NEXT:    v_mad_f32 v6, -v5, v4, v6
+; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
-; GFX6-NEXT:    s_xor_b32 s8, s4, s6
-; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
-; GFX6-NEXT:    s_or_b32 s8, s8, 1
-; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
-; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT:    v_mov_b32_e32 v4, s8
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s6
-; GFX6-NEXT:    s_sext_i32_i16 s6, s7
+; GFX6-NEXT:    v_or_b32_e32 v3, 1, v3
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    s_sext_i32_i16 s4, s7
+; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s4
+; GFX6-NEXT:    s_sext_i32_i16 s6, s5
+; GFX6-NEXT:    s_xor_b32 s4, s6, s4
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s6
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v1
-; GFX6-NEXT:    s_sext_i32_i16 s4, s5
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GFX6-NEXT:    s_xor_b32 s4, s4, s6
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v3
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
 ; GFX6-NEXT:    s_or_b32 s4, s4, 1
-; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
-; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s4
-; GFX6-NEXT:    s_ashr_i32 s4, s7, 16
+; GFX6-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX6-NEXT:    v_mad_f32 v2, -v4, v3, v2
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
-; GFX6-NEXT:    s_ashr_i32 s6, s5, 16
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s6
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX6-NEXT:    s_xor_b32 s7, s6, s4
-; GFX6-NEXT:    s_ashr_i32 s7, s7, 30
+; GFX6-NEXT:    s_ashr_i32 s4, s7, 16
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v3|
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s4
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s7
+; GFX6-NEXT:    s_lshr_b32 s6, s7, 16
+; GFX6-NEXT:    s_ashr_i32 s7, s5, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s7
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX6-NEXT:    s_xor_b32 s4, s7, s4
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX6-NEXT:    s_or_b32 s4, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
+; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX6-NEXT:    s_or_b32 s7, s7, 1
-; GFX6-NEXT:    v_mov_b32_e32 v6, s7
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s4
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
+; GFX6-NEXT:    v_mov_b32_e32 v6, s4
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s6
+; GFX6-NEXT:    s_lshr_b32 s4, s5, 16
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -3347,78 +3356,78 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s0, s6
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GFX9-NEXT:    s_sext_i32_i16 s1, s4
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s1
-; GFX9-NEXT:    s_xor_b32 s0, s1, s0
+; GFX9-NEXT:    s_sext_i32_i16 s8, s6
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GFX9-NEXT:    s_sext_i32_i16 s9, s4
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
+; GFX9-NEXT:    s_xor_b32 s0, s9, s8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s8, s0, 1
+; GFX9-NEXT:    s_or_b32 s10, s0, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
-; GFX9-NEXT:    s_ashr_i32 s9, s6, 16
+; GFX9-NEXT:    s_cselect_b32 s0, s10, 0
+; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s9
-; GFX9-NEXT:    s_ashr_i32 s8, s4, 16
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s6
+; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX9-NEXT:    v_add_u32_e32 v1, s0, v3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s8
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    s_xor_b32 s0, s8, s9
+; GFX9-NEXT:    s_xor_b32 s0, s4, s6
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT:    s_or_b32 s6, s0, 1
+; GFX9-NEXT:    s_or_b32 s8, s0, 1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s6, 0
+; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
+; GFX9-NEXT:    s_sext_i32_i16 s8, s7
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s8
 ; GFX9-NEXT:    v_add_u32_e32 v0, s0, v4
-; GFX9-NEXT:    s_sext_i32_i16 s0, s7
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
-; GFX9-NEXT:    s_sext_i32_i16 s1, s5
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s1
-; GFX9-NEXT:    s_xor_b32 s0, s1, s0
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
+; GFX9-NEXT:    s_sext_i32_i16 s6, s5
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX9-NEXT:    s_xor_b32 s0, s6, s8
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s6, s0, 1
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
+; GFX9-NEXT:    s_or_b32 s10, s0, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
 ; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s6, 0
-; GFX9-NEXT:    s_ashr_i32 s6, s7, 16
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s6
+; GFX9-NEXT:    s_cselect_b32 s0, s10, 0
+; GFX9-NEXT:    s_ashr_i32 s7, s7, 16
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s7
+; GFX9-NEXT:    s_ashr_i32 s5, s5, 16
+; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_add_u32_e32 v3, s0, v5
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s7
-; GFX9-NEXT:    s_ashr_i32 s7, s5, 16
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s7
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX9-NEXT:    s_xor_b32 s0, s7, s6
+; GFX9-NEXT:    s_xor_b32 s0, s5, s7
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s9, s0, 1
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s8
 ; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
+; GFX9-NEXT:    s_or_b32 s8, s0, 1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
 ; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s9, 0
+; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
 ; GFX9-NEXT:    v_add_u32_e32 v4, s0, v6
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s6
-; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v1
-; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v3
-; GFX9-NEXT:    v_sub_u32_e32 v0, s8, v0
-; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s7
+; GFX9-NEXT:    v_sub_u32_e32 v5, s9, v1
+; GFX9-NEXT:    v_sub_u32_e32 v1, s6, v3
+; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
@@ -3820,21 +3829,20 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ;
 ; GFX6-LABEL: udiv_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s8, 0xffff
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s6, s2, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT:    s_and_b32 s6, s0, s8
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX6-NEXT:    s_and_b32 s2, s6, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    s_and_b32 s9, s4, s8
+; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
@@ -3843,16 +3851,16 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    s_and_b32 s0, s3, s8
+; GFX6-NEXT:    s_and_b32 s4, s7, s8
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s0
-; GFX6-NEXT:    s_and_b32 s0, s1, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
+; GFX6-NEXT:    s_and_b32 s4, s5, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v5, v6
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
@@ -3863,55 +3871,54 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
-; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: udiv_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s2, s0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX9-NEXT:    s_and_b32 s1, s6, s0
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NEXT:    s_and_b32 s8, s6, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
+; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX9-NEXT:    s_and_b32 s1, s4, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s6
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s1
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s2
-; GFX9-NEXT:    s_lshr_b32 s1, s6, 16
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v1
+; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
+; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX9-NEXT:    v_mad_f32 v2, -v4, v1, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v6
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX9-NEXT:    s_and_b32 s1, s3, s0
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mad_f32 v3, -v2, v4, v5
+; GFX9-NEXT:    s_and_b32 s1, s7, s0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GFX9-NEXT:    v_mad_f32 v4, -v2, v3, v5
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX9-NEXT:    s_and_b32 s0, s7, s0
+; GFX9-NEXT:    s_and_b32 s0, s5, s0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v6, v7
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v6
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    global_store_short v1, v3, s[4:5] offset:4
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-NEXT:    global_store_short v0, v3, s[2:3] offset:4
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <3 x i16> %x, %y
   store <3 x i16> %r, <3 x i16> addrspace(1)* %out
@@ -3991,114 +3998,112 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ;
 ; GFX6-LABEL: urem_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s8, 0xffff
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s6, s2, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT:    s_and_b32 s6, s0, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
-; GFX6-NEXT:    v_mov_b32_e32 v4, s2
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX6-NEXT:    v_alignbit_b32 v4, s3, v4, 16
-; GFX6-NEXT:    v_and_b32_e32 v5, s8, v4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
-; GFX6-NEXT:    v_mad_f32 v2, -v3, v0, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v5
+; GFX6-NEXT:    s_and_b32 s9, s6, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX6-NEXT:    s_and_b32 s2, s4, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
+; GFX6-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX6-NEXT:    v_alignbit_b32 v2, s7, v2, 16
+; GFX6-NEXT:    v_and_b32_e32 v5, s8, v2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, v5
+; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v4
+; GFX6-NEXT:    v_mad_f32 v3, -v4, v0, v3
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
-; GFX6-NEXT:    v_alignbit_b32 v1, s1, v1, 16
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
+; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 16
 ; GFX6-NEXT:    v_and_b32_e32 v3, s8, v1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT:    s_and_b32 s0, s3, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s0
-; GFX6-NEXT:    v_mul_f32_e32 v5, v3, v5
-; GFX6-NEXT:    s_and_b32 s0, s1, s8
-; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT:    s_and_b32 s4, s7, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
+; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX6-NEXT:    v_mad_f32 v3, -v4, v5, v3
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX6-NEXT:    s_and_b32 s4, s5, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
-; GFX6-NEXT:    v_mad_f32 v3, -v5, v2, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
 ; GFX6-NEXT:    v_mad_f32 v3, -v3, v6, v7
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v6
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s3
+; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s7
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v3
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
-; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: urem_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s2, s0
-; GFX9-NEXT:    s_and_b32 s8, s4, s0
+; GFX9-NEXT:    s_and_b32 s8, s6, s2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT:    s_and_b32 s3, s4, s2
+; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s2
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v2
-; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, v4, v5
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    s_and_b32 s5, s5, s0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v3
-; GFX9-NEXT:    v_mad_f32 v3, -v1, v2, v4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s5
-; GFX9-NEXT:    s_and_b32 s0, s3, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
+; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX9-NEXT:    s_and_b32 s7, s7, s2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v4
+; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s7
+; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
+; GFX9-NEXT:    s_and_b32 s2, s5, s2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s8
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v6
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mad_f32 v2, -v2, v4, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s4
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s5
-; GFX9-NEXT:    v_sub_u32_e32 v0, s1, v0
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s7
+; GFX9-NEXT:    v_sub_u32_e32 v0, s3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, s0, v2
+; GFX9-NEXT:    v_sub_u32_e32 v1, s4, v1
+; GFX9-NEXT:    v_sub_u32_e32 v2, s2, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT:    global_store_short v3, v2, s[6:7] offset:4
-; GFX9-NEXT:    global_store_dword v3, v0, s[6:7]
+; GFX9-NEXT:    global_store_short v3, v2, s[0:1] offset:4
+; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <3 x i16> %x, %y
   store <3 x i16> %r, <3 x i16> addrspace(1)* %out
@@ -4184,19 +4189,18 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ;
 ; GFX6-LABEL: sdiv_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_sext_i32_i16 s8, s2
+; GFX6-NEXT:    s_sext_i32_i16 s8, s6
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
-; GFX6-NEXT:    s_sext_i32_i16 s9, s0
+; GFX6-NEXT:    s_sext_i32_i16 s9, s4
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
 ; GFX6-NEXT:    s_xor_b32 s8, s9, s8
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
 ; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
 ; GFX6-NEXT:    s_or_b32 s8, s8, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
@@ -4204,55 +4208,54 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
-; GFX6-NEXT:    s_xor_b32 s0, s0, s2
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX6-NEXT:    s_or_b32 s0, s0, 1
+; GFX6-NEXT:    s_xor_b32 s4, s4, s6
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX6-NEXT:    s_or_b32 s4, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
-; GFX6-NEXT:    v_mov_b32_e32 v4, s0
-; GFX6-NEXT:    s_sext_i32_i16 s0, s3
+; GFX6-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NEXT:    s_sext_i32_i16 s4, s7
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
-; GFX6-NEXT:    s_sext_i32_i16 s1, s1
+; GFX6-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s1
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GFX6-NEXT:    s_xor_b32 s0, s1, s0
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX6-NEXT:    s_or_b32 s0, s0, 1
+; GFX6-NEXT:    s_xor_b32 s4, s5, s4
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX6-NEXT:    s_or_b32 s4, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX6-NEXT:    v_mov_b32_e32 v5, s0
+; GFX6-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
-; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s0, s2
+; GFX9-NEXT:    s_sext_i32_i16 s0, s6
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
 ; GFX9-NEXT:    s_sext_i32_i16 s1, s4
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
@@ -4266,44 +4269,44 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
-; GFX9-NEXT:    s_ashr_i32 s1, s2, 16
+; GFX9-NEXT:    s_ashr_i32 s1, s6, 16
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
-; GFX9-NEXT:    s_ashr_i32 s2, s4, 16
+; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX9-NEXT:    v_add_u32_e32 v2, s0, v3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s2
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    s_xor_b32 s0, s2, s1
+; GFX9-NEXT:    s_xor_b32 s0, s4, s1
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s2, s0, 1
+; GFX9-NEXT:    s_or_b32 s4, s0, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT:    s_sext_i32_i16 s1, s3
+; GFX9-NEXT:    s_sext_i32_i16 s1, s7
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
-; GFX9-NEXT:    s_cselect_b32 s0, s2, 0
+; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
 ; GFX9-NEXT:    v_add_u32_e32 v3, s0, v4
 ; GFX9-NEXT:    s_sext_i32_i16 s0, s5
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
 ; GFX9-NEXT:    s_xor_b32 s0, s0, s1
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s2, s0, 1
+; GFX9-NEXT:    s_or_b32 s4, s0, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s2, 0
+; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
 ; GFX9-NEXT:    v_add_u32_e32 v0, s0, v5
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX9-NEXT:    global_store_short v1, v0, s[6:7] offset:4
-; GFX9-NEXT:    global_store_dword v1, v2, s[6:7]
+; GFX9-NEXT:    global_store_short v1, v0, s[2:3] offset:4
+; GFX9-NEXT:    global_store_dword v1, v2, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <3 x i16> %x, %y
   store <3 x i16> %r, <3 x i16> addrspace(1)* %out
@@ -4395,51 +4398,51 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ;
 ; GFX6-LABEL: srem_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xb
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_sext_i32_i16 s2, s4
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
 ; GFX6-NEXT:    s_sext_i32_i16 s8, s6
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s8
-; GFX6-NEXT:    s_xor_b32 s2, s8, s2
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GFX6-NEXT:    s_sext_i32_i16 s9, s4
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
+; GFX6-NEXT:    s_xor_b32 s8, s9, s8
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    s_ashr_i32 s2, s2, 30
-; GFX6-NEXT:    s_or_b32 s2, s2, 1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s2
+; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
+; GFX6-NEXT:    s_or_b32 s8, s8, 1
+; GFX6-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    v_alignbit_b32 v2, s5, v2, 16
+; GFX6-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NEXT:    v_alignbit_b32 v2, s7, v2, 16
 ; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v3
-; GFX6-NEXT:    v_alignbit_b32 v1, s7, v1, 16
+; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 16
 ; GFX6-NEXT:    v_bfe_i32 v5, v1, 0, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GFX6-NEXT:    v_xor_b32_e32 v3, v5, v3
-; GFX6-NEXT:    s_sext_i32_i16 s4, s5
+; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
 ; GFX6-NEXT:    v_mul_f32_e32 v5, v6, v7
 ; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    v_mad_f32 v6, -v5, v4, v6
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
+; GFX6-NEXT:    s_sext_i32_i16 s4, s7
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s4
 ; GFX6-NEXT:    v_or_b32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT:    s_sext_i32_i16 s6, s7
+; GFX6-NEXT:    s_sext_i32_i16 s6, s5
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v4
@@ -4454,11 +4457,10 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v4|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s5
+; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s7
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
@@ -4467,66 +4469,65 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ;
 ; GFX9-LABEL: srem_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s8, s2
+; GFX9-NEXT:    s_sext_i32_i16 s8, s6
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
 ; GFX9-NEXT:    s_sext_i32_i16 s9, s4
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
-; GFX9-NEXT:    s_xor_b32 s6, s9, s8
+; GFX9-NEXT:    s_xor_b32 s2, s9, s8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX9-NEXT:    s_ashr_i32 s6, s6, 30
-; GFX9-NEXT:    s_or_b32 s10, s6, 1
-; GFX9-NEXT:    s_sext_i32_i16 s5, s5
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    s_or_b32 s10, s2, 1
+; GFX9-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
-; GFX9-NEXT:    s_and_b64 s[6:7], s[6:7], exec
-; GFX9-NEXT:    s_cselect_b32 s6, s10, 0
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s2, s10, 0
+; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s6
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    v_add_u32_e32 v1, s6, v2
+; GFX9-NEXT:    s_sext_i32_i16 s5, s5
+; GFX9-NEXT:    v_add_u32_e32 v1, s2, v2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX9-NEXT:    s_xor_b32 s6, s4, s2
-; GFX9-NEXT:    s_ashr_i32 s6, s6, 30
+; GFX9-NEXT:    s_xor_b32 s2, s4, s6
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GFX9-NEXT:    s_or_b32 s8, s2, 1
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX9-NEXT:    s_or_b32 s8, s6, 1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v2|, |v0|
-; GFX9-NEXT:    s_and_b64 s[6:7], s[6:7], exec
-; GFX9-NEXT:    s_cselect_b32 s6, s8, 0
-; GFX9-NEXT:    v_add_u32_e32 v0, s6, v3
-; GFX9-NEXT:    s_sext_i32_i16 s6, s3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s6
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s7
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s2, s8, 0
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v3
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s5
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
-; GFX9-NEXT:    s_xor_b32 s2, s5, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
+; GFX9-NEXT:    s_xor_b32 s2, s5, s7
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
-; GFX9-NEXT:    s_or_b32 s7, s2, 1
-; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX9-NEXT:    s_or_b32 s6, s2, 1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v3|, |v2|
 ; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT:    s_cselect_b32 s2, s7, 0
+; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
 ; GFX9-NEXT:    v_add_u32_e32 v2, s2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s6
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s7
+; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v3, v2, s[0:1] offset:4
@@ -5680,109 +5681,106 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ;
 ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX6-NEXT:    s_movk_i32 s4, 0x1000
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
+; GFX6-NEXT:    s_movk_i32 s2, 0x1000
+; GFX6-NEXT:    s_mov_b32 s0, 0x4f7ffffe
+; GFX6-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s8, s4, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX6-NEXT:    s_lshl_b32 s9, s4, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
+; GFX6-NEXT:    s_lshl_b32 s3, s2, s6
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT:    s_lshl_b32 s2, s2, s7
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s0, 0x4f7ffffe
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s0, v1
-; GFX6-NEXT:    s_sub_i32 s0, 0, s8
+; GFX6-NEXT:    s_sub_i32 s0, 0, s3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX6-NEXT:    s_sub_i32 s0, 0, s9
+; GFX6-NEXT:    s_sub_i32 s0, 0, s2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
-; GFX6-NEXT:    v_mul_hi_u32 v1, s3, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
+; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s2
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v4
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NEXT:    s_movk_i32 s4, 0x1000
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_movk_i32 s2, 0x1000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s5, s4, s3
-; GFX9-NEXT:    s_lshl_b32 s4, s4, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b32 s6, s2, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT:    s_lshl_b32 s7, s2, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
-; GFX9-NEXT:    s_sub_i32 s3, 0, s5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_sub_i32 s3, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s2, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s2, v1
-; GFX9-NEXT:    s_sub_i32 s2, 0, s4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s3, v1
+; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s4
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s6
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s5
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s7
 ; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
+; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s4, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s6, v3
+; GFX9-NEXT:    v_sub_u32_e32 v4, s5, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v4, s3, v4
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[0:1]
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v4
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = udiv <2 x i32> %x, %shl.y
@@ -6020,18 +6018,18 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ;
 ; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX6-NEXT:    s_movk_i32 s4, 0x1000
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
+; GFX6-NEXT:    s_movk_i32 s2, 0x1000
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s6, s4, s2
+; GFX6-NEXT:    s_lshl_b32 s6, s2, s6
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT:    s_lshl_b32 s7, s4, s3
+; GFX6-NEXT:    s_lshl_b32 s7, s2, s7
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    v_mul_f32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s2, v1
@@ -6039,13 +6037,11 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX6-NEXT:    s_sub_i32 s2, 0, s7
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
@@ -6070,49 +6066,47 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ;
 ; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NEXT:    s_movk_i32 s4, 0x1000
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_movk_i32 s2, 0x1000
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s5, s4, s3
-; GFX9-NEXT:    s_lshl_b32 s4, s4, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
-; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
-; GFX9-NEXT:    s_sub_i32 s3, 0, s5
+; GFX9-NEXT:    s_lshl_b32 s3, s2, s7
+; GFX9-NEXT:    s_lshl_b32 s2, s2, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX9-NEXT:    s_mov_b32 s6, 0x4f7ffffe
+; GFX9-NEXT:    s_sub_i32 s7, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX9-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s2, 0, s4
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT:    s_sub_i32 s6, 0, s2
+; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX9-NEXT:    v_mul_hi_u32 v1, s3, v1
+; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s5
-; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, s3, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v1
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
+; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -6492,133 +6486,130 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ;
 ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX6-NEXT:    s_movk_i32 s10, 0x1000
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
+; GFX6-NEXT:    s_movk_i32 s2, 0x1000
 ; GFX6-NEXT:    s_mov_b32 s12, 0x4f7ffffe
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s2, s10, s2
-; GFX6-NEXT:    s_ashr_i32 s11, s2, 31
-; GFX6-NEXT:    s_add_i32 s2, s2, s11
-; GFX6-NEXT:    s_xor_b32 s2, s2, s11
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    s_lshl_b32 s0, s10, s3
-; GFX6-NEXT:    s_sub_i32 s10, 0, s2
-; GFX6-NEXT:    s_ashr_i32 s3, s0, 31
+; GFX6-NEXT:    s_lshl_b32 s3, s2, s6
+; GFX6-NEXT:    s_ashr_i32 s6, s3, 31
+; GFX6-NEXT:    s_add_i32 s3, s3, s6
+; GFX6-NEXT:    s_xor_b32 s3, s3, s6
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT:    s_lshl_b32 s0, s2, s7
+; GFX6-NEXT:    s_sub_i32 s7, 0, s3
+; GFX6-NEXT:    s_ashr_i32 s2, s0, 31
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_add_i32 s0, s0, s3
-; GFX6-NEXT:    s_ashr_i32 s1, s8, 31
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_add_i32 s0, s0, s2
+; GFX6-NEXT:    s_ashr_i32 s1, s4, 31
+; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, s12, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s10, v0
-; GFX6-NEXT:    s_xor_b32 s10, s0, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s10
-; GFX6-NEXT:    s_add_i32 s0, s8, s1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v0
+; GFX6-NEXT:    s_xor_b32 s7, s0, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s7
+; GFX6-NEXT:    s_add_i32 s0, s4, s1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    s_xor_b32 s0, s0, s1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT:    s_xor_b32 s8, s1, s11
+; GFX6-NEXT:    s_xor_b32 s4, s1, s6
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s12, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s0, v2
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX6-NEXT:    s_sub_i32 s0, 0, s10
+; GFX6-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
-; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
-; GFX6-NEXT:    s_add_i32 s1, s9, s0
+; GFX6-NEXT:    s_ashr_i32 s0, s5, 31
+; GFX6-NEXT:    s_add_i32 s1, s5, s0
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
+; GFX6-NEXT:    s_xor_b32 s2, s0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX6-NEXT:    s_xor_b32 s2, s0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s10
+; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s7
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s7, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NEXT:    s_movk_i32 s8, 0x1000
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    s_movk_i32 s0, 0x1000
 ; GFX9-NEXT:    s_mov_b32 s10, 0x4f7ffffe
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s2, s8, s2
-; GFX9-NEXT:    s_ashr_i32 s9, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s9
-; GFX9-NEXT:    s_xor_b32 s2, s2, s9
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT:    s_lshl_b32 s0, s8, s3
-; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
-; GFX9-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-NEXT:    s_lshl_b32 s1, s0, s6
+; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
+; GFX9-NEXT:    s_add_i32 s1, s1, s6
+; GFX9-NEXT:    s_xor_b32 s1, s1, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s7
+; GFX9-NEXT:    s_ashr_i32 s8, s0, 31
+; GFX9-NEXT:    s_add_i32 s0, s0, s8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    s_xor_b32 s0, s0, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GFX9-NEXT:    s_sub_i32 s3, 0, s2
+; GFX9-NEXT:    s_sub_i32 s9, 0, s1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s10, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s8, 0, s0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
+; GFX9-NEXT:    s_ashr_i32 s7, s4, 31
+; GFX9-NEXT:    s_add_i32 s4, s4, s7
+; GFX9-NEXT:    v_mul_lo_u32 v3, s9, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s10, v1
-; GFX9-NEXT:    s_ashr_i32 s3, s6, 31
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    s_xor_b32 s4, s4, s7
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX9-NEXT:    s_add_i32 s6, s6, s3
-; GFX9-NEXT:    s_xor_b32 s6, s6, s3
-; GFX9-NEXT:    s_xor_b32 s3, s3, s9
+; GFX9-NEXT:    s_sub_i32 s10, 0, s0
+; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
+; GFX9-NEXT:    s_add_i32 s5, s5, s9
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
-; GFX9-NEXT:    s_xor_b32 s1, s8, s1
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
+; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
+; GFX9-NEXT:    s_xor_b32 s6, s7, s6
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v4, s6, v4
-; GFX9-NEXT:    s_add_i32 s6, s7, s8
-; GFX9-NEXT:    s_xor_b32 s6, s6, s8
+; GFX9-NEXT:    v_sub_u32_e32 v4, s4, v4
+; GFX9-NEXT:    s_xor_b32 s4, s5, s9
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
-; GFX9-NEXT:    v_mul_hi_u32 v1, s6, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v1, s4, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s2, v4
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s1, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s0
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_xor_b32_e32 v0, s3, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
-; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
+; GFX9-NEXT:    s_xor_b32 s1, s9, s8
+; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s0, v3
@@ -6627,8 +6618,9 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s1, v1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = sdiv <2 x i32> %x, %shl.y
@@ -6951,120 +6943,117 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ;
 ; GFX6-LABEL: srem_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX6-NEXT:    s_movk_i32 s6, 0x1000
-; GFX6-NEXT:    s_mov_b32 s10, 0x4f7ffffe
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
+; GFX6-NEXT:    s_movk_i32 s8, 0x1000
+; GFX6-NEXT:    s_mov_b32 s11, 0x4f7ffffe
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s2, s6, s2
-; GFX6-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX6-NEXT:    s_add_i32 s2, s2, s4
-; GFX6-NEXT:    s_xor_b32 s2, s2, s4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    s_lshl_b32 s3, s6, s3
-; GFX6-NEXT:    s_ashr_i32 s6, s3, 31
-; GFX6-NEXT:    s_add_i32 s3, s3, s6
+; GFX6-NEXT:    s_lshl_b32 s2, s8, s6
+; GFX6-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX6-NEXT:    s_add_i32 s2, s2, s3
+; GFX6-NEXT:    s_xor_b32 s6, s2, s3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX6-NEXT:    s_lshl_b32 s7, s8, s7
+; GFX6-NEXT:    s_ashr_i32 s9, s7, 31
+; GFX6-NEXT:    s_add_i32 s7, s7, s9
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_sub_i32 s9, 0, s2
-; GFX6-NEXT:    s_xor_b32 s3, s3, s6
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s3
-; GFX6-NEXT:    v_mul_f32_e32 v0, s10, v0
+; GFX6-NEXT:    s_sub_i32 s10, 0, s6
+; GFX6-NEXT:    s_xor_b32 s7, s7, s9
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s7
+; GFX6-NEXT:    v_mul_f32_e32 v0, s11, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
+; GFX6-NEXT:    s_ashr_i32 s8, s4, 31
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s9, v0
-; GFX6-NEXT:    s_sub_i32 s9, 0, s3
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s8, s0, 31
-; GFX6-NEXT:    s_add_i32 s0, s0, s8
+; GFX6-NEXT:    s_add_i32 s4, s4, s8
+; GFX6-NEXT:    v_mul_lo_u32 v1, s10, v0
+; GFX6-NEXT:    s_xor_b32 s4, s4, s8
+; GFX6-NEXT:    s_sub_i32 s10, 0, s7
+; GFX6-NEXT:    s_ashr_i32 s9, s5, 31
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    s_xor_b32 s0, s0, s8
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s10, v2
+; GFX6-NEXT:    v_mul_f32_e32 v1, s11, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v1
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
-; GFX6-NEXT:    s_ashr_i32 s0, s1, 31
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GFX6-NEXT:    s_add_i32 s1, s1, s0
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
+; GFX6-NEXT:    s_add_i32 s4, s5, s9
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    s_xor_b32 s1, s1, s0
+; GFX6-NEXT:    s_xor_b32 s4, s4, s9
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
+; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT:    v_xor_b32_e32 v1, s9, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v1
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_movk_i32 s0, 0x1000
 ; GFX9-NEXT:    s_mov_b32 s8, 0x4f7ffffe
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s1, s0, s2
-; GFX9-NEXT:    s_ashr_i32 s2, s1, 31
-; GFX9-NEXT:    s_add_i32 s1, s1, s2
-; GFX9-NEXT:    s_xor_b32 s1, s1, s2
+; GFX9-NEXT:    s_lshl_b32 s1, s0, s6
+; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
+; GFX9-NEXT:    s_add_i32 s1, s1, s6
+; GFX9-NEXT:    s_xor_b32 s1, s1, s6
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
-; GFX9-NEXT:    s_ashr_i32 s2, s0, 31
-; GFX9-NEXT:    s_add_i32 s0, s0, s2
-; GFX9-NEXT:    s_xor_b32 s0, s0, s2
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s7
+; GFX9-NEXT:    s_ashr_i32 s6, s0, 31
+; GFX9-NEXT:    s_add_i32 s0, s0, s6
+; GFX9-NEXT:    s_xor_b32 s0, s0, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GFX9-NEXT:    s_sub_i32 s3, 0, s1
-; GFX9-NEXT:    s_ashr_i32 s2, s6, 31
+; GFX9-NEXT:    s_sub_i32 s7, 0, s1
+; GFX9-NEXT:    s_ashr_i32 s6, s4, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s8, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_add_i32 s4, s4, s6
+; GFX9-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s8, v1
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_add_i32 s3, s6, s2
-; GFX9-NEXT:    s_sub_i32 s6, 0, s0
+; GFX9-NEXT:    s_sub_i32 s7, 0, s0
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, s6, v1
-; GFX9-NEXT:    s_xor_b32 s3, s3, s2
-; GFX9-NEXT:    s_ashr_i32 s6, s7, 31
+; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v1
+; GFX9-NEXT:    s_ashr_i32 s7, s5, 31
+; GFX9-NEXT:    s_add_i32 s5, s5, s7
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GFX9-NEXT:    s_add_i32 s7, s7, s6
-; GFX9-NEXT:    s_xor_b32 s7, s7, s6
+; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX9-NEXT:    s_xor_b32 s5, s5, s7
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s7, v1
+; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s0
-; GFX9-NEXT:    v_sub_u32_e32 v0, s3, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, s7, v1
+; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v1
@@ -7072,11 +7061,11 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s6, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s6, v1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s7, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v1, s7, v1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = srem <2 x i32> %x, %shl.y
@@ -7703,16 +7692,15 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ;
 ; GFX6-LABEL: udiv_v2i64_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_add_i32 s4, s4, 12
-; GFX6-NEXT:    s_add_i32 s6, s6, 12
-; GFX6-NEXT:    s_lshr_b64 s[4:5], s[8:9], s4
-; GFX6-NEXT:    s_lshr_b64 s[6:7], s[10:11], s6
+; GFX6-NEXT:    s_add_i32 s8, s8, 12
+; GFX6-NEXT:    s_add_i32 s9, s10, 12
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
@@ -7722,20 +7710,19 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ;
 ; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s0, s4, 12
-; GFX9-NEXT:    s_add_i32 s4, s6, 12
-; GFX9-NEXT:    s_lshr_b64 s[0:1], s[8:9], s0
-; GFX9-NEXT:    s_lshr_b64 s[4:5], s[10:11], s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_add_i32 s2, s8, 12
+; GFX9-NEXT:    s_add_i32 s8, s10, 12
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[4:5], s2
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = udiv <2 x i64> %x, %shl.y
@@ -8135,49 +8122,47 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ;
 ; GFX6-LABEL: urem_v2i64_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0xd
 ; GFX6-NEXT:    s_mov_b64 s[12:13], 0x1000
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_mov_b32 s11, 0xf000
+; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[12:13], s2
-; GFX6-NEXT:    s_lshl_b64 s[0:1], s[12:13], s0
-; GFX6-NEXT:    s_add_u32 s0, s0, -1
-; GFX6-NEXT:    s_addc_u32 s1, s1, -1
-; GFX6-NEXT:    s_and_b64 s[0:1], s[8:9], s[0:1]
-; GFX6-NEXT:    s_add_u32 s2, s2, -1
-; GFX6-NEXT:    s_addc_u32 s3, s3, -1
-; GFX6-NEXT:    s_and_b64 s[2:3], s[10:11], s[2:3]
+; GFX6-NEXT:    s_lshl_b64 s[6:7], s[12:13], s6
+; GFX6-NEXT:    s_lshl_b64 s[4:5], s[12:13], s4
+; GFX6-NEXT:    s_add_u32 s4, s4, -1
+; GFX6-NEXT:    s_addc_u32 s5, s5, -1
+; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
+; GFX6-NEXT:    s_add_u32 s4, s6, -1
+; GFX6-NEXT:    s_addc_u32 s5, s7, -1
+; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0x1000
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[6:7], s[0:1], s6
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s4
-; GFX9-NEXT:    s_add_u32 s0, s0, -1
-; GFX9-NEXT:    s_addc_u32 s1, s1, -1
-; GFX9-NEXT:    s_and_b64 s[0:1], s[8:9], s[0:1]
-; GFX9-NEXT:    s_add_u32 s4, s6, -1
-; GFX9-NEXT:    s_addc_u32 s5, s7, -1
-; GFX9-NEXT:    s_and_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX9-NEXT:    s_add_u32 s2, s2, -1
+; GFX9-NEXT:    s_addc_u32 s3, s3, -1
+; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; GFX9-NEXT:    s_add_u32 s4, s10, -1
+; GFX9-NEXT:    s_addc_u32 s5, s11, -1
+; GFX9-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = urem <2 x i64> %x, %shl.y
@@ -9091,42 +9076,42 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ;
 ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
-; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
+; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
+; GFX6-NEXT:    s_mov_b64 s[12:13], 0x1000
 ; GFX6-NEXT:    s_mov_b32 s18, 0x4f800000
 ; GFX6-NEXT:    s_mov_b32 s19, 0x5f7ffffc
 ; GFX6-NEXT:    s_mov_b32 s20, 0x2f800000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s6
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
-; GFX6-NEXT:    s_ashr_i32 s12, s3, 31
-; GFX6-NEXT:    s_add_u32 s2, s2, s12
-; GFX6-NEXT:    s_mov_b32 s13, s12
-; GFX6-NEXT:    s_addc_u32 s3, s3, s12
-; GFX6-NEXT:    s_xor_b64 s[10:11], s[2:3], s[12:13]
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s10
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[12:13], s8
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[12:13], s10
+; GFX6-NEXT:    s_ashr_i32 s14, s9, 31
+; GFX6-NEXT:    s_add_u32 s8, s8, s14
+; GFX6-NEXT:    s_mov_b32 s15, s14
+; GFX6-NEXT:    s_addc_u32 s9, s9, s14
+; GFX6-NEXT:    s_xor_b64 s[12:13], s[8:9], s[14:15]
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s13
 ; GFX6-NEXT:    s_mov_b32 s21, 0xcf800000
-; GFX6-NEXT:    s_sub_u32 s6, 0, s10
-; GFX6-NEXT:    s_subb_u32 s7, 0, s11
+; GFX6-NEXT:    s_sub_u32 s10, 0, s12
+; GFX6-NEXT:    s_subb_u32 s11, 0, s13
 ; GFX6-NEXT:    v_mac_f32_e32 v0, s18, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX6-NEXT:    s_ashr_i32 s16, s5, 31
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
+; GFX6-NEXT:    s_add_u32 s0, s4, s16
 ; GFX6-NEXT:    v_mul_f32_e32 v0, s19, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v0
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s14, s1, 31
-; GFX6-NEXT:    s_add_u32 s0, s0, s14
-; GFX6-NEXT:    v_mul_lo_u32 v0, s6, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v2
-; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v2
-; GFX6-NEXT:    s_mov_b32 s15, s14
+; GFX6-NEXT:    s_mov_b32 s17, s16
+; GFX6-NEXT:    s_addc_u32 s1, s5, s16
+; GFX6-NEXT:    v_mul_lo_u32 v0, s10, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v2
+; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v2
+; GFX6-NEXT:    v_mul_lo_u32 v5, s10, v2
+; GFX6-NEXT:    s_xor_b64 s[4:5], s[0:1], s[16:17]
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v2, v3
@@ -9138,8 +9123,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX6-NEXT:    s_addc_u32 s1, s1, s14
-; GFX6-NEXT:    s_xor_b64 s[16:17], s[0:1], s[14:15]
+; GFX6-NEXT:    s_xor_b64 s[14:15], s[16:17], s[14:15]
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0
@@ -9148,13 +9132,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, s6, v1
-; GFX6-NEXT:    v_mul_hi_u32 v4, s6, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, s7, v2
-; GFX6-NEXT:    s_xor_b64 s[14:15], s[14:15], s[12:13]
-; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
+; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v1
+; GFX6-NEXT:    v_mul_hi_u32 v4, s10, v2
+; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v2
+; GFX6-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v2
+; GFX6-NEXT:    v_mul_lo_u32 v4, s10, v2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v4
@@ -9172,57 +9155,59 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, s16, v1
-; GFX6-NEXT:    v_mul_hi_u32 v4, s16, v2
-; GFX6-NEXT:    v_mul_hi_u32 v5, s16, v1
-; GFX6-NEXT:    v_mul_hi_u32 v6, s17, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s17, v1
+; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v1
+; GFX6-NEXT:    v_mul_hi_u32 v4, s4, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, s4, v1
+; GFX6-NEXT:    v_mul_hi_u32 v6, s5, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s5, v1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s17, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s17, v2
-; GFX6-NEXT:    s_add_u32 s8, s8, s12
-; GFX6-NEXT:    s_mov_b32 s13, s12
+; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s5, v2
+; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v0, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v2
-; GFX6-NEXT:    v_mul_hi_u32 v4, s10, v1
-; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v1
-; GFX6-NEXT:    v_mov_b32_e32 v6, s11
-; GFX6-NEXT:    s_addc_u32 s9, s9, s12
+; GFX6-NEXT:    v_mul_lo_u32 v3, s12, v2
+; GFX6-NEXT:    v_mul_hi_u32 v4, s12, v1
+; GFX6-NEXT:    v_mul_lo_u32 v5, s13, v1
+; GFX6-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_mul_lo_u32 v4, s10, v1
+; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s17, v3
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s16, v4
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v3
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
 ; GFX6-NEXT:    v_subb_u32_e64 v5, s[0:1], v5, v6, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s10, v4
+; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s12, v4
 ; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v5
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v6
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e64 v6, s[0:1], 2, v1
 ; GFX6-NEXT:    v_addc_u32_e64 v7, s[0:1], 0, v2, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e64 v8, s[0:1], 1, v1
 ; GFX6-NEXT:    v_addc_u32_e64 v9, s[0:1], 0, v2, s[0:1]
+; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX6-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX6-NEXT:    s_add_u32 s2, s2, s4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, v9, v7, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v7, s17
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v9, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v10, s9
+; GFX6-NEXT:    v_mov_b32_e32 v7, s5
+; GFX6-NEXT:    s_mov_b32 s5, s4
+; GFX6-NEXT:    s_addc_u32 s3, s3, s4
+; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v9, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v10, s3
 ; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v4
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v3
 ; GFX6-NEXT:    v_mac_f32_e32 v9, s18, v10
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
 ; GFX6-NEXT:    v_rcp_f32_e32 v4, v9
@@ -9235,13 +9220,13 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mac_f32_e32 v4, s21, v5
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX6-NEXT:    s_sub_u32 s0, 0, s8
+; GFX6-NEXT:    s_sub_u32 s0, 0, s2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s0, v5
-; GFX6-NEXT:    s_subb_u32 s1, 0, s9
+; GFX6-NEXT:    s_subb_u32 s1, 0, s3
 ; GFX6-NEXT:    v_mul_lo_u32 v7, s1, v4
-; GFX6-NEXT:    s_ashr_i32 s10, s3, 31
+; GFX6-NEXT:    s_ashr_i32 s12, s7, 31
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s0, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
@@ -9254,7 +9239,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v9, v5, v6
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GFX6-NEXT:    s_mov_b32 s11, s10
+; GFX6-NEXT:    s_mov_b32 s13, s12
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s14, v1
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v8, v6, vcc
@@ -9267,7 +9252,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_hi_u32 v6, s0, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v7, s1, v3
 ; GFX6-NEXT:    v_xor_b32_e32 v2, s15, v2
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s0, v3
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
@@ -9285,46 +9269,45 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v7, v0, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
-; GFX6-NEXT:    s_add_u32 s0, s2, s10
+; GFX6-NEXT:    s_add_u32 s0, s6, s12
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT:    s_addc_u32 s1, s3, s10
+; GFX6-NEXT:    s_addc_u32 s1, s7, s12
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX6-NEXT:    s_xor_b64 s[2:3], s[0:1], s[10:11]
-; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v4
-; GFX6-NEXT:    v_mul_hi_u32 v6, s2, v3
-; GFX6-NEXT:    v_mul_hi_u32 v8, s2, v4
-; GFX6-NEXT:    v_mul_hi_u32 v9, s3, v4
-; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v4
+; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
+; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v4
+; GFX6-NEXT:    v_mul_hi_u32 v6, s6, v3
+; GFX6-NEXT:    v_mul_hi_u32 v8, s6, v4
+; GFX6-NEXT:    v_mul_hi_u32 v9, s7, v4
+; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v4
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v8, s3, v3
-; GFX6-NEXT:    v_mul_hi_u32 v3, s3, v3
+; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v3
+; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v7, s15
-; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v3, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v9, v0, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v0, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s8, v4
-; GFX6-NEXT:    v_mul_hi_u32 v6, s8, v3
+; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v4
+; GFX6-NEXT:    v_mul_hi_u32 v6, s2, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v1
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v7, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v3
+; GFX6-NEXT:    v_mul_lo_u32 v2, s3, v3
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GFX6-NEXT:    v_mov_b32_e32 v7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, s8, v3
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s3, v2
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s2, v5
+; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v3
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s7, v2
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s6, v5
 ; GFX6-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s8, v5
+; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s2, v5
 ; GFX6-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v6
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v6
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v3
 ; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v4, s[0:1]
@@ -9332,59 +9315,58 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v4, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v8, s3
+; GFX6-NEXT:    v_mov_b32_e32 v8, s7
 ; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v8, v2, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v2
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v5, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX6-NEXT:    s_xor_b64 s[0:1], s[10:11], s[12:13]
+; GFX6-NEXT:    s_xor_b64 s[0:1], s[12:13], s[4:5]
 ; GFX6-NEXT:    v_xor_b32_e32 v3, s0, v3
 ; GFX6-NEXT:    v_xor_b32_e32 v4, s1, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v3
 ; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v4, v5, vcc
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
 ; GFX9-NEXT:    s_mov_b32 s16, 0x4f800000
 ; GFX9-NEXT:    s_mov_b32 s17, 0x5f7ffffc
 ; GFX9-NEXT:    s_mov_b32 s18, 0x2f800000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s6
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
 ; GFX9-NEXT:    s_ashr_i32 s12, s3, 31
 ; GFX9-NEXT:    s_add_u32 s2, s2, s12
 ; GFX9-NEXT:    s_mov_b32 s13, s12
 ; GFX9-NEXT:    s_addc_u32 s3, s3, s12
-; GFX9-NEXT:    s_xor_b64 s[10:11], s[2:3], s[12:13]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; GFX9-NEXT:    s_xor_b64 s[8:9], s[2:3], s[12:13]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX9-NEXT:    s_mov_b32 s19, 0xcf800000
-; GFX9-NEXT:    s_sub_u32 s2, 0, s10
-; GFX9-NEXT:    s_subb_u32 s3, 0, s11
+; GFX9-NEXT:    s_sub_u32 s2, 0, s8
+; GFX9-NEXT:    s_subb_u32 s3, 0, s9
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
+; GFX9-NEXT:    s_mov_b32 s15, s14
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
-; GFX9-NEXT:    s_mov_b32 s15, s14
 ; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v2
@@ -9449,44 +9431,44 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v0, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v2, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v2
-; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v1
-; GFX9-NEXT:    v_mul_lo_u32 v5, s11, v1
-; GFX9-NEXT:    v_mov_b32_e32 v6, s11
+; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v1
+; GFX9-NEXT:    v_mul_lo_u32 v5, s9, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v1
+; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v1
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
 ; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s4, v4
 ; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s10, v4
+; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s8, v4
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v5
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v6
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v1
 ; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v2, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v1
 ; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v2, s[0:1]
-; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
+; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX9-NEXT:    s_add_u32 s8, s8, s4
+; GFX9-NEXT:    s_add_u32 s10, s10, s4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v7, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX9-NEXT:    s_mov_b32 s5, s4
-; GFX9-NEXT:    s_addc_u32 s9, s9, s4
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, s8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, s9
+; GFX9-NEXT:    s_addc_u32 s11, s11, s4
+; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[4:5]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, s10
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, s11
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
 ; GFX9-NEXT:    v_mac_f32_e32 v9, s16, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
 ; GFX9-NEXT:    v_rcp_f32_e32 v4, v9
@@ -9499,8 +9481,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mac_f32_e32 v4, s19, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    s_sub_u32 s0, 0, s8
-; GFX9-NEXT:    s_subb_u32 s1, 0, s9
+; GFX9-NEXT:    s_sub_u32 s0, 0, s10
+; GFX9-NEXT:    s_subb_u32 s1, 0, s11
 ; GFX9-NEXT:    v_mul_hi_u32 v6, s0, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v7, s0, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s1, v4
@@ -9517,8 +9499,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v5, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v5, v3
-; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX9-NEXT:    s_mov_b32 s11, s10
+; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
+; GFX9-NEXT:    s_mov_b32 s9, s8
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v0, vcc
@@ -9530,7 +9512,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_hi_u32 v6, s0, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v7, s1, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s0, v3
-; GFX9-NEXT:    s_add_u32 s0, s6, s10
+; GFX9-NEXT:    s_add_u32 s0, s6, s8
 ; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v7
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v5
@@ -9548,9 +9530,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    s_addc_u32 s1, s7, s10
+; GFX9-NEXT:    s_addc_u32 s1, s7, s8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[8:9]
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s6, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v6, s6, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v8, s6, v4
@@ -9567,26 +9549,26 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v0, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, s8, v4
-; GFX9-NEXT:    v_mul_hi_u32 v6, s8, v3
-; GFX9-NEXT:    v_mul_lo_u32 v8, s9, v3
+; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v4
+; GFX9-NEXT:    v_mul_hi_u32 v6, s10, v3
+; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s13
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s12, v1
 ; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v3
+; GFX9-NEXT:    v_mul_lo_u32 v6, s10, v3
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v8
 ; GFX9-NEXT:    v_sub_u32_e32 v7, s7, v5
-; GFX9-NEXT:    v_mov_b32_e32 v8, s9
+; GFX9-NEXT:    v_mov_b32_e32 v8, s11
 ; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, s6, v6
 ; GFX9-NEXT:    v_subb_co_u32_e64 v7, s[0:1], v7, v8, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v8, s[0:1], s8, v6
+; GFX9-NEXT:    v_subrev_co_u32_e64 v8, s[0:1], s10, v6
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[0:1], 0, v7, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v7
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 2, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v4, s[0:1]
@@ -9596,16 +9578,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v9, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v9, v5, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v6
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[8:9], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s0, v3
 ; GFX9-NEXT:    v_xor_b32_e32 v4, s1, v4
@@ -10272,19 +10254,19 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ;
 ; GFX6-LABEL: srem_v2i64_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
+; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
 ; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
 ; GFX6-NEXT:    s_mov_b32 s18, 0x4f800000
 ; GFX6-NEXT:    s_mov_b32 s19, 0x5f7ffffc
 ; GFX6-NEXT:    s_mov_b32 s20, 0x2f800000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b64 s[14:15], s[2:3], s6
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
-; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX6-NEXT:    s_add_u32 s2, s2, s4
-; GFX6-NEXT:    s_mov_b32 s5, s4
-; GFX6-NEXT:    s_addc_u32 s3, s3, s4
-; GFX6-NEXT:    s_xor_b64 s[16:17], s[2:3], s[4:5]
+; GFX6-NEXT:    s_lshl_b64 s[14:15], s[2:3], s10
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
+; GFX6-NEXT:    s_add_u32 s2, s2, s8
+; GFX6-NEXT:    s_mov_b32 s9, s8
+; GFX6-NEXT:    s_addc_u32 s3, s3, s8
+; GFX6-NEXT:    s_xor_b64 s[16:17], s[2:3], s[8:9]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s17
 ; GFX6-NEXT:    s_mov_b32 s21, 0xcf800000
@@ -10292,24 +10274,22 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    s_subb_u32 s3, 0, s17
 ; GFX6-NEXT:    v_mac_f32_e32 v0, s18, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_ashr_i32 s12, s5, 31
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
+; GFX6-NEXT:    s_add_u32 s0, s4, s12
 ; GFX6-NEXT:    v_mul_f32_e32 v0, s19, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v0
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
-; GFX6-NEXT:    s_add_u32 s0, s8, s12
+; GFX6-NEXT:    s_mov_b32 s13, s12
+; GFX6-NEXT:    s_addc_u32 s1, s5, s12
 ; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
-; GFX6-NEXT:    s_mov_b32 s13, s12
+; GFX6-NEXT:    s_xor_b64 s[4:5], s[0:1], s[12:13]
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v2, v3
@@ -10321,8 +10301,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX6-NEXT:    s_addc_u32 s1, s9, s12
-; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
+; GFX6-NEXT:    s_mov_b32 s11, 0xf000
+; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0
@@ -10353,15 +10333,15 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX6-NEXT:    v_mul_hi_u32 v4, s8, v2
-; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v1
-; GFX6-NEXT:    v_mul_hi_u32 v6, s9, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s9, v1
+; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v1
+; GFX6-NEXT:    v_mul_hi_u32 v4, s4, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, s4, v1
+; GFX6-NEXT:    v_mul_hi_u32 v6, s5, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s5, v1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s9, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s9, v2
+; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s5, v2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v0, vcc
@@ -10373,9 +10353,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s16, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s9, v2
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s5, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s17
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s8, v1
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
 ; GFX6-NEXT:    v_subb_u32_e64 v3, s[0:1], v3, v4, vcc
 ; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s16, v1
 ; GFX6-NEXT:    v_subbrev_u32_e64 v6, s[2:3], 0, v3, s[0:1]
@@ -10390,14 +10370,14 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
 ; GFX6-NEXT:    s_ashr_i32 s2, s15, 31
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX6-NEXT:    s_add_u32 s8, s14, s2
+; GFX6-NEXT:    s_add_u32 s4, s14, s2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v6, s9
+; GFX6-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX6-NEXT:    s_mov_b32 s3, s2
-; GFX6-NEXT:    s_addc_u32 s9, s15, s2
-; GFX6-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v8, s9
+; GFX6-NEXT:    s_addc_u32 s5, s15, s2
+; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v8, s5
 ; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s17, v2
 ; GFX6-NEXT:    v_mac_f32_e32 v7, s18, v8
@@ -10416,13 +10396,13 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mac_f32_e32 v4, s21, v5
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX6-NEXT:    s_sub_u32 s0, 0, s8
+; GFX6-NEXT:    s_sub_u32 s0, 0, s4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s0, v5
-; GFX6-NEXT:    s_subb_u32 s1, 0, s9
+; GFX6-NEXT:    s_subb_u32 s1, 0, s5
 ; GFX6-NEXT:    v_mul_lo_u32 v7, s1, v4
-; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
+; GFX6-NEXT:    s_ashr_i32 s14, s7, 31
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s0, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
@@ -10465,58 +10445,58 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v7, v0, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
-; GFX6-NEXT:    s_add_u32 s0, s10, s14
+; GFX6-NEXT:    s_add_u32 s0, s6, s14
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT:    s_addc_u32 s1, s11, s14
+; GFX6-NEXT:    s_addc_u32 s1, s7, s14
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
-; GFX6-NEXT:    v_mul_lo_u32 v5, s10, v4
-; GFX6-NEXT:    v_mul_hi_u32 v6, s10, v3
-; GFX6-NEXT:    v_mul_hi_u32 v8, s10, v4
-; GFX6-NEXT:    v_mul_hi_u32 v9, s11, v4
-; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v4
+; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[14:15]
+; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v4
+; GFX6-NEXT:    v_mul_hi_u32 v6, s6, v3
+; GFX6-NEXT:    v_mul_hi_u32 v8, s6, v4
+; GFX6-NEXT:    v_mul_hi_u32 v9, s7, v4
+; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v4
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v8, s11, v3
-; GFX6-NEXT:    v_mul_hi_u32 v3, s11, v3
+; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v3
+; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v7, s12
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v3, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v9, v0, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v0, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s8, v0
-; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
+; GFX6-NEXT:    v_mul_hi_u32 v5, s4, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v1
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v7, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v3
-; GFX6-NEXT:    v_mul_lo_u32 v3, s8, v3
+; GFX6-NEXT:    v_mul_lo_u32 v2, s5, v3
+; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v3
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
-; GFX6-NEXT:    v_mov_b32_e32 v5, s9
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, s5
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v3
+; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s4, v3
 ; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v7
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s5, v7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v6
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s4, v6
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v7
-; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v6
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s5, v7
+; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s4, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
 ; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v7, s11
+; GFX6-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v7, v2, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v2
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -10527,24 +10507,25 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s14
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v3
 ; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v4, v5, vcc
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: srem_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
 ; GFX9-NEXT:    s_mov_b32 s16, 0x4f800000
 ; GFX9-NEXT:    s_mov_b32 s17, 0x5f7ffffc
 ; GFX9-NEXT:    s_mov_b32 s18, 0x2f800000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s6
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_add_u32 s2, s2, s4
-; GFX9-NEXT:    s_mov_b32 s5, s4
-; GFX9-NEXT:    s_addc_u32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX9-NEXT:    s_ashr_i32 s8, s3, 31
+; GFX9-NEXT:    s_add_u32 s2, s2, s8
+; GFX9-NEXT:    s_mov_b32 s9, s8
+; GFX9-NEXT:    s_addc_u32 s3, s3, s8
+; GFX9-NEXT:    s_xor_b64 s[12:13], s[2:3], s[8:9]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
 ; GFX9-NEXT:    s_mov_b32 s19, 0xcf800000
@@ -10552,16 +10533,14 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_subb_u32 s3, 0, s13
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
+; GFX9-NEXT:    s_mov_b32 s9, s8
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
-; GFX9-NEXT:    s_mov_b32 s9, s8
 ; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
index baff889c521e8..654aac83b9ccc 100644
--- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -29,16 +29,15 @@ define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, i32, <2 x
 }
 
 ; GCN-LABEL: {{^}}v_ashr_v2i16:
-; GCN: {{buffer|flat|global}}_load_dword [[LHS:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[RHS:v[0-9]+]]
-; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+; GCN: {{buffer|flat|global}}_load_dwordx2 v[[[LHS:[0-9]+]]:[[RHS:[0-9]+]]]
+; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], v[[RHS]], v[[LHS]]
 
 ; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 ; CI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
-; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, [[LHS]]
+; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v[[LHS]]
 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -117,8 +116,7 @@ define amdgpu_kernel void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 }
 
 ; GCN-LABEL: {{^}}v_ashr_v4i16:
-; GCN: {{buffer|flat|global}}_load_dwordx2
-; GCN: {{buffer|flat|global}}_load_dwordx2
+; GCN: {{buffer|flat|global}}_load_dwordx4
 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll
index 6aa3d0bc91cbe..b3ae7e0530bb3 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll
@@ -157,8 +157,8 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)*
   %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1
 
-  %a = load <1 x float>, <1 x float> addrspace(1)* %gep.0
-  %b = load <1 x float>, <1 x float> addrspace(1)* %gep.1
+  %a = load volatile <1 x float>, <1 x float> addrspace(1)* %gep.0
+  %b = load volatile <1 x float>, <1 x float> addrspace(1)* %gep.1
 
   %cmp = fcmp ogt <1 x float> %a, %b
   %val = select <1 x i1> %cmp, <1 x float> %a, <1 x float> %b

diff  --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
index 051cb5b40269d..9435b8278f300 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -187,8 +187,8 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)*
   %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1
 
-  %a = load <1 x float>, <1 x float> addrspace(1)* %gep.0
-  %b = load <1 x float>, <1 x float> addrspace(1)* %gep.1
+  %a = load volatile <1 x float>, <1 x float> addrspace(1)* %gep.0
+  %b = load volatile <1 x float>, <1 x float> addrspace(1)* %gep.1
 
   %cmp = fcmp ult <1 x float> %a, %b
   %val = select <1 x i1> %cmp, <1 x float> %a, <1 x float> %b
@@ -214,8 +214,8 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)*
   %gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %gep.0, i32 1
 
-  %a = load <2 x float>, <2 x float> addrspace(1)* %gep.0
-  %b = load <2 x float>, <2 x float> addrspace(1)* %gep.1
+  %a = load volatile <2 x float>, <2 x float> addrspace(1)* %gep.0
+  %b = load volatile <2 x float>, <2 x float> addrspace(1)* %gep.1
 
   %cmp = fcmp ult <2 x float> %a, %b
   %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b

diff  --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index f3d220b684be5..edba47666197f 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -161,47 +161,45 @@ entry:
 define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
 ; SI-LABEL: fshl_v2i32:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_alignbit_b32 v0, s9, v0, 1
+; SI-NEXT:    v_mov_b32_e32 v0, s7
+; SI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
 ; SI-NEXT:    s_not_b32 s1, s1
-; SI-NEXT:    s_lshr_b32 s3, s9, 1
+; SI-NEXT:    s_lshr_b32 s2, s5, 1
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:    s_not_b32 s0, s0
-; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
-; SI-NEXT:    s_lshr_b32 s1, s8, 1
+; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
+; SI-NEXT:    s_lshr_b32 s1, s4, 1
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v2
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fshl_v2i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x3c
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    s_lshr_b32 s3, s5, 1
+; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    s_not_b32 s3, s3
+; VI-NEXT:    s_lshr_b32 s7, s5, 1
 ; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
-; VI-NEXT:    s_not_b32 s5, s7
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_not_b32 s3, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_alignbit_b32 v1, s7, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    s_not_b32 s2, s2
 ; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
-; VI-NEXT:    s_lshr_b32 s2, s4, 1
-; VI-NEXT:    v_mov_b32_e32 v2, s3
-; VI-NEXT:    v_alignbit_b32 v0, s2, v0, v2
+; VI-NEXT:    s_lshr_b32 s3, s4, 1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_alignbit_b32 v0, s3, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -209,25 +207,24 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
 ;
 ; GFX9-LABEL: fshl_v2i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x3c
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX9-NEXT:    s_lshr_b32 s0, s5, 1
 ; GFX9-NEXT:    s_not_b32 s1, s9
+; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_alignbit_b32 v1, s0, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    s_not_b32 s1, s8
 ; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
 ; GFX9-NEXT:    s_lshr_b32 s0, s4, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_v2i32:
@@ -250,21 +247,20 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
 ;
 ; GFX10-LABEL: fshl_v2i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x3c
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
 ; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s3, 1
-; GFX10-NEXT:    v_alignbit_b32 v0, s3, s5, 1
-; GFX10-NEXT:    v_alignbit_b32 v3, s2, s4, 1
-; GFX10-NEXT:    s_not_b32 s1, s7
-; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX10-NEXT:    s_not_b32 s3, s6
+; GFX10-NEXT:    v_alignbit_b32 v0, s5, s7, 1
+; GFX10-NEXT:    v_alignbit_b32 v3, s4, s6, 1
+; GFX10-NEXT:    s_lshr_b32 s0, s5, 1
+; GFX10-NEXT:    s_not_b32 s1, s3
+; GFX10-NEXT:    s_lshr_b32 s3, s4, 1
+; GFX10-NEXT:    s_not_b32 s2, s2
 ; GFX10-NEXT:    v_alignbit_b32 v1, s0, v0, s1
-; GFX10-NEXT:    v_alignbit_b32 v0, s2, v3, s3
+; GFX10-NEXT:    v_alignbit_b32 v0, s3, v3, s2
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX10-NEXT:    s_endpgm
 entry:
@@ -276,27 +272,25 @@ entry:
 define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
 ; SI-LABEL: fshl_v2i32_imm:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 23
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_alignbit_b32 v0, s6, v0, 25
+; SI-NEXT:    v_mov_b32_e32 v0, s7
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 23
+; SI-NEXT:    v_alignbit_b32 v0, s4, v2, 25
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fshl_v2i32_imm:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 23
 ; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 25
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -306,16 +300,15 @@ define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32>
 ;
 ; GFX9-LABEL: fshl_v2i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 23
 ; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 25
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_v2i32_imm:
@@ -334,15 +327,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32>
 ;
 ; GFX10-LABEL: fshl_v2i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s3, s5, 23
-; GFX10-NEXT:    v_alignbit_b32 v0, s2, s4, 25
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    v_alignbit_b32 v1, s5, s7, 23
+; GFX10-NEXT:    v_alignbit_b32 v0, s4, s6, 25
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX10-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
@@ -353,69 +345,67 @@ entry:
 define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; SI-LABEL: fshl_v4i32:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x15
-; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x15
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    v_alignbit_b32 v0, s15, v0, 1
-; SI-NEXT:    s_not_b32 s3, s3
-; SI-NEXT:    s_lshr_b32 s7, s15, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    v_mov_b32_e32 v0, s11
+; SI-NEXT:    s_not_b32 s11, s15
+; SI-NEXT:    v_alignbit_b32 v0, s7, v0, 1
+; SI-NEXT:    s_lshr_b32 s7, s7, 1
+; SI-NEXT:    v_mov_b32_e32 v1, s11
 ; SI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    s_not_b32 s2, s2
-; SI-NEXT:    v_alignbit_b32 v0, s14, v0, 1
-; SI-NEXT:    s_lshr_b32 s3, s14, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s2
-; SI-NEXT:    v_alignbit_b32 v2, s3, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    s_not_b32 s1, s1
-; SI-NEXT:    v_alignbit_b32 v0, s13, v0, 1
-; SI-NEXT:    s_lshr_b32 s2, s13, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    s_not_b32 s0, s0
-; SI-NEXT:    v_alignbit_b32 v0, s12, v0, 1
-; SI-NEXT:    s_lshr_b32 s1, s12, 1
-; SI-NEXT:    v_mov_b32_e32 v4, s0
-; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v4
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    s_not_b32 s7, s14
+; SI-NEXT:    v_alignbit_b32 v0, s6, v0, 1
+; SI-NEXT:    s_lshr_b32 s6, s6, 1
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_alignbit_b32 v2, s6, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    s_not_b32 s6, s13
+; SI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
+; SI-NEXT:    s_lshr_b32 s5, s5, 1
+; SI-NEXT:    v_mov_b32_e32 v1, s6
+; SI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    s_not_b32 s5, s12
+; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
+; SI-NEXT:    s_lshr_b32 s4, s4, 1
+; SI-NEXT:    v_mov_b32_e32 v4, s5
+; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fshl_v4i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
-; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s7
-; VI-NEXT:    s_lshr_b32 s2, s11, 1
+; VI-NEXT:    v_mov_b32_e32 v0, s11
 ; VI-NEXT:    s_not_b32 s3, s15
-; VI-NEXT:    v_alignbit_b32 v0, s11, v0, 1
+; VI-NEXT:    s_lshr_b32 s2, s7, 1
+; VI-NEXT:    v_alignbit_b32 v0, s7, v0, 1
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_alignbit_b32 v3, s2, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v0, s10
 ; VI-NEXT:    s_not_b32 s3, s14
-; VI-NEXT:    v_alignbit_b32 v0, s10, v0, 1
-; VI-NEXT:    s_lshr_b32 s2, s10, 1
+; VI-NEXT:    v_alignbit_b32 v0, s6, v0, 1
+; VI-NEXT:    s_lshr_b32 s2, s6, 1
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_alignbit_b32 v2, s2, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s9
 ; VI-NEXT:    s_not_b32 s3, s13
-; VI-NEXT:    v_alignbit_b32 v0, s9, v0, 1
-; VI-NEXT:    s_lshr_b32 s2, s9, 1
+; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
+; VI-NEXT:    s_lshr_b32 s2, s5, 1
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s8
 ; VI-NEXT:    s_not_b32 s3, s12
-; VI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
-; VI-NEXT:    s_lshr_b32 s2, s8, 1
+; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
+; VI-NEXT:    s_lshr_b32 s2, s4, 1
 ; VI-NEXT:    v_mov_b32_e32 v4, s3
 ; VI-NEXT:    v_alignbit_b32 v0, s2, v0, v4
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
@@ -425,34 +415,33 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
 ;
 ; GFX9-LABEL: fshl_v4i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NEXT:    v_alignbit_b32 v0, s11, v0, 1
-; GFX9-NEXT:    s_lshr_b32 s0, s11, 1
 ; GFX9-NEXT:    s_not_b32 s1, s15
+; GFX9-NEXT:    v_mov_b32_e32 v0, s11
+; GFX9-NEXT:    s_lshr_b32 s0, s7, 1
+; GFX9-NEXT:    v_alignbit_b32 v0, s7, v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_alignbit_b32 v3, s0, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX9-NEXT:    s_not_b32 s1, s14
-; GFX9-NEXT:    v_alignbit_b32 v0, s10, v0, 1
-; GFX9-NEXT:    s_lshr_b32 s0, s10, 1
+; GFX9-NEXT:    v_alignbit_b32 v0, s6, v0, 1
+; GFX9-NEXT:    s_lshr_b32 s0, s6, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s9
 ; GFX9-NEXT:    s_not_b32 s1, s13
-; GFX9-NEXT:    v_alignbit_b32 v0, s9, v0, 1
-; GFX9-NEXT:    s_lshr_b32 s0, s9, 1
+; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
+; GFX9-NEXT:    s_lshr_b32 s0, s5, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_alignbit_b32 v1, s0, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    s_not_b32 s1, s12
-; GFX9-NEXT:    v_alignbit_b32 v0, s8, v0, 1
-; GFX9-NEXT:    s_lshr_b32 s0, s8, 1
+; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
+; GFX9-NEXT:    s_lshr_b32 s0, s4, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v5
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
@@ -486,30 +475,29 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
 ;
 ; GFX10-LABEL: fshl_v4i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s7, 1
 ; GFX10-NEXT:    v_alignbit_b32 v0, s7, s11, 1
 ; GFX10-NEXT:    v_alignbit_b32 v1, s6, s10, 1
 ; GFX10-NEXT:    v_alignbit_b32 v5, s5, s9, 1
 ; GFX10-NEXT:    v_alignbit_b32 v6, s4, s8, 1
-; GFX10-NEXT:    s_not_b32 s1, s15
+; GFX10-NEXT:    s_lshr_b32 s2, s7, 1
+; GFX10-NEXT:    s_not_b32 s3, s15
 ; GFX10-NEXT:    s_lshr_b32 s6, s6, 1
 ; GFX10-NEXT:    s_not_b32 s7, s14
 ; GFX10-NEXT:    s_lshr_b32 s5, s5, 1
 ; GFX10-NEXT:    s_not_b32 s9, s13
 ; GFX10-NEXT:    s_lshr_b32 s4, s4, 1
 ; GFX10-NEXT:    s_not_b32 s8, s12
-; GFX10-NEXT:    v_alignbit_b32 v3, s0, v0, s1
+; GFX10-NEXT:    v_alignbit_b32 v3, s2, v0, s3
 ; GFX10-NEXT:    v_alignbit_b32 v2, s6, v1, s7
 ; GFX10-NEXT:    v_alignbit_b32 v1, s5, v5, s9
 ; GFX10-NEXT:    v_alignbit_b32 v0, s4, v6, s8
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 entry:
   %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
@@ -520,58 +508,55 @@ entry:
 define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
 ; SI-LABEL: fshl_v4i32_imm:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
-; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 31
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_alignbit_b32 v2, s10, v0, 23
-; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 25
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 31
+; SI-NEXT:    v_mov_b32_e32 v0, s11
+; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:    v_alignbit_b32 v3, s7, v0, 31
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_alignbit_b32 v2, s6, v1, 23
+; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 25
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 31
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fshl_v4i32_imm:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
-; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s7
-; VI-NEXT:    v_mov_b32_e32 v1, s6
-; VI-NEXT:    v_alignbit_b32 v3, s11, v0, 31
-; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    v_alignbit_b32 v2, s10, v1, 23
-; VI-NEXT:    v_alignbit_b32 v1, s9, v0, 25
-; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s11
+; VI-NEXT:    v_mov_b32_e32 v1, s10
+; VI-NEXT:    v_mov_b32_e32 v4, s9
+; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 31
+; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 23
+; VI-NEXT:    v_alignbit_b32 v1, s5, v4, 25
+; VI-NEXT:    v_mov_b32_e32 v0, s8
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_alignbit_b32 v0, s8, v0, 31
+; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 31
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fshl_v4i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_alignbit_b32 v3, s11, v0, 31
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-NEXT:    v_alignbit_b32 v2, s10, v1, 23
-; GFX9-NEXT:    v_alignbit_b32 v1, s9, v0, 25
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_alignbit_b32 v0, s8, v0, 31
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s11
+; GFX9-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 31
+; GFX9-NEXT:    v_mov_b32_e32 v0, s9
+; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 23
+; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 25
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 31
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_v4i32_imm:
@@ -594,9 +579,8 @@ define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32>
 ;
 ; GFX10-LABEL: fshl_v4i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 5214a02b9f74f..1c041d0c7d700 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -156,35 +156,33 @@ entry:
 define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
 ; SI-LABEL: fshr_v2i32:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xf
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_mov_b32_e32 v0, s7
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v2
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fshr_v2i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x3c
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_alignbit_b32 v1, s7, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_alignbit_b32 v0, s6, v0, v2
+; VI-NEXT:    v_alignbit_b32 v0, s4, v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -192,18 +190,17 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
 ;
 ; GFX9-LABEL: fshr_v2i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x3c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
 ; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_alignbit_b32 v1, s7, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NEXT:    v_alignbit_b32 v0, s6, v0, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -223,10 +220,9 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
 ;
 ; GFX10-LABEL: fshr_v2i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    s_clause 0x2
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -245,27 +241,25 @@ entry:
 define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
 ; SI-LABEL: fshr_v2i32_imm:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 9
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_alignbit_b32 v0, s6, v0, 7
+; SI-NEXT:    v_mov_b32_e32 v0, s7
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
+; SI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fshr_v2i32_imm:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
 ; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -275,16 +269,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32>
 ;
 ; GFX9-LABEL: fshr_v2i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 9
 ; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 7
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_v2i32_imm:
@@ -303,15 +296,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32>
 ;
 ; GFX10-LABEL: fshr_v2i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s3, s5, 9
-; GFX10-NEXT:    v_alignbit_b32 v0, s2, s4, 7
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    v_alignbit_b32 v1, s5, s7, 9
+; GFX10-NEXT:    v_alignbit_b32 v0, s4, s6, 7
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX10-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
@@ -322,47 +314,45 @@ entry:
 define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; SI-LABEL: fshr_v4i32:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
-; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x15
-; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s15, 0xf000
+; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x15
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    v_mov_b32_e32 v1, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s8
-; SI-NEXT:    v_alignbit_b32 v3, s3, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s10
-; SI-NEXT:    v_alignbit_b32 v2, s2, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    s_mov_b32 s14, -1
-; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v4
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; SI-NEXT:    v_mov_b32_e32 v0, s11
+; SI-NEXT:    v_mov_b32_e32 v1, s15
+; SI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    v_mov_b32_e32 v1, s14
+; SI-NEXT:    v_alignbit_b32 v2, s6, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_mov_b32_e32 v1, s13
+; SI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v4, s12
+; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fshr_v4i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
-; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x54
-; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s7
-; VI-NEXT:    v_mov_b32_e32 v1, s11
-; VI-NEXT:    v_alignbit_b32 v3, s15, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s10
-; VI-NEXT:    v_alignbit_b32 v2, s14, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    v_alignbit_b32 v1, s13, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v4, s8
-; VI-NEXT:    v_alignbit_b32 v0, s12, v0, v4
+; VI-NEXT:    v_mov_b32_e32 v0, s11
+; VI-NEXT:    v_mov_b32_e32 v1, s15
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s14
+; VI-NEXT:    v_alignbit_b32 v2, s6, v2, v0
+; VI-NEXT:    v_mov_b32_e32 v0, s9
+; VI-NEXT:    v_mov_b32_e32 v1, s13
+; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v4, s12
+; VI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -370,25 +360,24 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
 ;
 ; GFX9-LABEL: fshr_v4i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x54
-; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s11
-; GFX9-NEXT:    v_alignbit_b32 v3, s15, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s10
-; GFX9-NEXT:    v_alignbit_b32 v2, s14, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    v_alignbit_b32 v1, s13, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v5, s8
-; GFX9-NEXT:    v_alignbit_b32 v0, s12, v0, v5
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s11
+; GFX9-NEXT:    v_mov_b32_e32 v1, s15
+; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s10
+; GFX9-NEXT:    v_mov_b32_e32 v1, s14
+; GFX9-NEXT:    v_alignbit_b32 v2, s6, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s9
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v5, s12
+; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v5
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_v4i32:
@@ -411,21 +400,20 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
 ;
 ; GFX10-LABEL: fshr_v4i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x54
-; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
-; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s7
-; GFX10-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-NEXT:    v_mov_b32_e32 v4, s5
-; GFX10-NEXT:    v_mov_b32_e32 v5, s4
-; GFX10-NEXT:    v_alignbit_b32 v3, s15, s11, v0
-; GFX10-NEXT:    v_alignbit_b32 v2, s14, s10, v1
-; GFX10-NEXT:    v_alignbit_b32 v1, s13, s9, v4
-; GFX10-NEXT:    v_alignbit_b32 v0, s12, s8, v5
+; GFX10-NEXT:    v_mov_b32_e32 v0, s15
+; GFX10-NEXT:    v_mov_b32_e32 v1, s14
+; GFX10-NEXT:    v_mov_b32_e32 v4, s13
+; GFX10-NEXT:    v_mov_b32_e32 v5, s12
+; GFX10-NEXT:    v_alignbit_b32 v3, s7, s11, v0
+; GFX10-NEXT:    v_alignbit_b32 v2, s6, s10, v1
+; GFX10-NEXT:    v_alignbit_b32 v1, s5, s9, v4
+; GFX10-NEXT:    v_alignbit_b32 v0, s4, s8, v5
 ; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[2:3]
 ; GFX10-NEXT:    s_endpgm
 entry:
@@ -437,58 +425,55 @@ entry:
 define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
 ; SI-LABEL: fshr_v4i32_imm:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
-; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 1
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_alignbit_b32 v2, s10, v0, 9
-; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 7
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
+; SI-NEXT:    v_mov_b32_e32 v0, s11
+; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:    v_alignbit_b32 v3, s7, v0, 1
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_alignbit_b32 v2, s6, v1, 9
+; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 7
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fshr_v4i32_imm:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
-; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s7
-; VI-NEXT:    v_mov_b32_e32 v1, s6
-; VI-NEXT:    v_alignbit_b32 v3, s11, v0, 1
-; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    v_alignbit_b32 v2, s10, v1, 9
-; VI-NEXT:    v_alignbit_b32 v1, s9, v0, 7
-; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s11
+; VI-NEXT:    v_mov_b32_e32 v1, s10
+; VI-NEXT:    v_mov_b32_e32 v4, s9
+; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 1
+; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 9
+; VI-NEXT:    v_alignbit_b32 v1, s5, v4, 7
+; VI-NEXT:    v_mov_b32_e32 v0, s8
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
+; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fshr_v4i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_alignbit_b32 v3, s11, v0, 1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-NEXT:    v_alignbit_b32 v2, s10, v1, 9
-; GFX9-NEXT:    v_alignbit_b32 v1, s9, v0, 7
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_alignbit_b32 v0, s8, v0, 1
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s11
+; GFX9-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s9
+; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 9
+; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 7
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_v4i32_imm:
@@ -509,9 +494,8 @@ define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32>
 ;
 ; GFX10-LABEL: fshr_v4i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index dc402e13068c5..9669f01ea3e27 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -1897,8 +1897,7 @@ define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #
 define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
 ; SI-LABEL: fadd_v2f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s0, s[4:5], 0x2
-; SI-NEXT:    s_load_dword s1, s[4:5], 0x3
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s2, s0, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
@@ -1921,20 +1920,19 @@ define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
 ;
 ; VI-LABEL: fadd_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[4:5], 0xc
-; VI-NEXT:    s_load_dword s3, s[4:5], 0x8
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_add_f16_e32 v1, s3, v1
-; VI-NEXT:    v_or_b32_e32 v2, v1, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_lshr_b32 s4, s1, 16
+; VI-NEXT:    s_lshr_b32 s5, s0, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e32 v0, s0, v0
+; VI-NEXT:    v_or_b32_e32 v2, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %add = fadd <2 x half> %a, %b
@@ -1949,20 +1947,14 @@ define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    s_add_u32 s2, s2, 8
-; SI-NEXT:    s_addc_u32 s3, s3, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    v_mov_b32_e32 v3, s3
-; SI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; SI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
+; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v4, s0
 ; SI-NEXT:    v_mov_b32_e32 v5, s1
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v3
@@ -1990,14 +1982,9 @@ define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 30762f36ed552..870f4fcb8996d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -186,8 +186,7 @@ bb:
 
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16:
 ; GCN:              s_load_dwordx4
-; GCN:              s_load_dwordx2
-; GCN:              s_load_dwordx2
+; GCN:              s_load_dwordx4
 ; GFX908_A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
 ; GFX908_A:         v_mfma_f32_4x4x4f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
 ; GFX908-COUNT-4:   v_accvgpr_read_b32

diff  --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 0ced63b67d91f..6ad048b4e2fae 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -7,20 +7,18 @@
 define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
 ; GFX9-LABEL: s_lshr_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dword s5, s[0:1], 0x30
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s5, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s3, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_lshr_v2i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT:    s_load_dword s3, s[0:1], 0x30
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s4, s2, 0xffff
@@ -38,33 +36,31 @@ define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ;
 ; CI-LABEL: s_lshr_v2i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s2, s[0:1], 0xb
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; CI-NEXT:    s_load_dword s0, s[0:1], 0xc
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_and_b32 s1, s2, 0xffff
-; CI-NEXT:    s_lshr_b32 s2, s2, 16
-; CI-NEXT:    s_lshr_b32 s3, s0, 16
-; CI-NEXT:    s_lshr_b32 s2, s2, s3
-; CI-NEXT:    s_lshl_b32 s2, s2, 16
-; CI-NEXT:    s_lshr_b32 s0, s1, s0
-; CI-NEXT:    s_or_b32 s0, s0, s2
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-NEXT:    s_and_b32 s6, s4, 0xffff
+; CI-NEXT:    s_lshr_b32 s4, s4, 16
+; CI-NEXT:    s_lshr_b32 s7, s5, 16
+; CI-NEXT:    s_lshr_b32 s4, s4, s7
+; CI-NEXT:    s_lshl_b32 s4, s4, 16
+; CI-NEXT:    s_lshr_b32 s5, s6, s5
+; CI-NEXT:    s_or_b32 s4, s5, s4
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_lshr_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX10-NEXT:    s_load_dword s5, s[0:1], 0x30
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s5, s4
-; GFX10-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s3, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %result = lshr <2 x i16> %lhs, %rhs
   store <2 x i16> %result, <2 x i16> addrspace(1)* %out
@@ -75,35 +71,31 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; GFX9-LABEL: v_lshr_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_lshr_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v5, v[0:1]
-; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b16_e32 v3, v2, v5
-; VI-NEXT:    v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_lshrrev_b16_e32 v4, v1, v0
+; VI-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v0, v4, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_lshr_v2i16:
@@ -115,13 +107,11 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4
+; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; CI-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
 ; CI-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -133,14 +123,12 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; GFX10-LABEL: v_lshr_v2i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
+; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -458,8 +446,7 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
@@ -474,10 +461,7 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
@@ -496,31 +480,29 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, 0
-; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT:    v_mov_b32_e32 v5, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
 ; CI-NEXT:    s_mov_b32 s4, 0xffff
 ; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; CI-NEXT:    v_and_b32_e32 v2, s4, v2
-; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; CI-NEXT:    v_and_b32_e32 v3, s4, v3
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; CI-NEXT:    v_lshrrev_b32_e32 v3, v5, v3
-; CI-NEXT:    v_lshrrev_b32_e32 v5, v9, v7
-; CI-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
-; CI-NEXT:    v_lshrrev_b32_e32 v4, v8, v6
-; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; CI-NEXT:    v_or_b32_e32 v3, v3, v5
-; CI-NEXT:    v_or_b32_e32 v2, v2, v4
-; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; CI-NEXT:    v_and_b32_e32 v0, s4, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; CI-NEXT:    v_and_b32_e32 v1, s4, v1
+; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; CI-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
+; CI-NEXT:    v_lshrrev_b32_e32 v3, v9, v7
+; CI-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v2, v8, v6
+; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT:    v_or_b32_e32 v1, v1, v3
+; CI-NEXT:    v_or_b32_e32 v0, v0, v2
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_lshr_v4i16:
@@ -528,9 +510,7 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, v2, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 1fe61b2f9c9b3..75139bfa1cf1f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -71,50 +71,42 @@ bb:
 define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
 ; GCN-LABEL: scalar_clause:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x2c
-; GCN-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-NEXT:    v_mov_b32_e32 v16, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[12:13], 0x0
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[12:13], 0x10
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x20
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x30
+; GCN-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-NEXT:    v_mov_b32_e32 v8, s8
 ; GCN-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-NEXT:    v_mov_b32_e32 v6, s6
 ; GCN-NEXT:    v_mov_b32_e32 v7, s7
+; GCN-NEXT:    v_mov_b32_e32 v8, s8
 ; GCN-NEXT:    v_mov_b32_e32 v9, s9
 ; GCN-NEXT:    v_mov_b32_e32 v10, s10
 ; GCN-NEXT:    v_mov_b32_e32 v11, s11
-; GCN-NEXT:    global_store_dwordx4 v12, v[0:3], s[16:17]
-; GCN-NEXT:    global_store_dwordx4 v12, v[4:7], s[16:17] offset:16
-; GCN-NEXT:    global_store_dwordx4 v12, v[8:11], s[16:17] offset:32
-; GCN-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-NEXT:    v_mov_b32_e32 v2, s14
-; GCN-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NEXT:    global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
+; GCN-NEXT:    v_mov_b32_e32 v12, s12
+; GCN-NEXT:    v_mov_b32_e32 v13, s13
+; GCN-NEXT:    v_mov_b32_e32 v14, s14
+; GCN-NEXT:    v_mov_b32_e32 v15, s15
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-SCRATCH-LABEL: scalar_clause:
 ; GCN-SCRATCH:       ; %bb.0: ; %bb
 ; GCN-SCRATCH-NEXT:    s_clause 0x1
-; GCN-SCRATCH-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
+; GCN-SCRATCH-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-SCRATCH-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x2c
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v16, 0
 ; GCN-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-SCRATCH-NEXT:    s_clause 0x3
-; GCN-SCRATCH-NEXT:    s_load_dwordx4 s[0:3], s[12:13], 0x0
-; GCN-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[12:13], 0x10
-; GCN-SCRATCH-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x20
-; GCN-SCRATCH-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x30
+; GCN-SCRATCH-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
 ; GCN-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v1, s1

diff  --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 27366601cd09b..c5e83127166bb 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -107,8 +107,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8
 }
 
 ; FUNC-LABEL: {{^}}s_test_imin_sle_v2i16:
-; GCN: s_load_dword s
-; GCN: s_load_dword s
+; GCN: s_load_dwordx2 s
+; GCN: s_load_dwordx2 s
 
 ; SI: s_ashr_i32
 ; SI: s_sext_i32_i16

diff  --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 475655b34772a..478701cc97486 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -709,13 +709,12 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
 ; SI-NEXT:  ; %bb.1: ; %bb7
 ; SI-NEXT:    s_endpgm
 ; SI-NEXT:  .LBB8_2: ; %bb11
-; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
-; SI-NEXT:    s_load_dword s4, s[0:1], 0xf
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_i32 s2, s2, 0x180000
-; SI-NEXT:    s_bfe_i32 s4, s4, 0x180000
+; SI-NEXT:    s_bfe_i32 s2, s4, 0x180000
+; SI-NEXT:    s_bfe_i32 s4, s6, 0x180000
 ; SI-NEXT:    s_mul_i32 s4, s2, s4
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
@@ -732,14 +731,13 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
 ; VI-NEXT:  ; %bb.1: ; %bb7
 ; VI-NEXT:    s_endpgm
 ; VI-NEXT:  .LBB8_2: ; %bb11
-; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
-; VI-NEXT:    s_load_dword s5, s[0:1], 0x3c
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bfe_i32 s4, s4, 0x180000
-; VI-NEXT:    s_bfe_i32 s5, s5, 0x180000
+; VI-NEXT:    s_bfe_i32 s5, s6, 0x180000
 ; VI-NEXT:    s_mul_i32 s4, s4, s5
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -755,18 +753,17 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
 ; GFX9-NEXT:  ; %bb.1: ; %bb7
 ; GFX9-NEXT:    s_endpgm
 ; GFX9-NEXT:  .LBB8_2: ; %bb11
-; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x3c
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
-; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
+; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x180000
+; GFX9-NEXT:    s_bfe_i32 s1, s6, 0x180000
 ; GFX9-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: simplify_i24_crash:

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index f255a4547fa10..a0421e5adc73b 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -440,15 +440,15 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; GCN-NEXT:    v_mul_lo_u32 v6, v4, v2
 ; GCN-NEXT:    v_mul_lo_u32 v10, v5, v3
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v6, v0
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v10, v1
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v10
 ; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v6, vcc, v2, v0
+; GCN-NEXT:    v_sub_i32_e32 v6, vcc, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
-; GCN-NEXT:    v_subrev_i32_e32 v7, vcc, v3, v1
+; GCN-NEXT:    v_sub_i32_e32 v7, vcc, v1, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
@@ -514,15 +514,15 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; TONGA-NEXT:    v_mul_lo_u32 v6, v4, v2
 ; TONGA-NEXT:    v_mul_lo_u32 v10, v5, v3
 ; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
-; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v6, v0
-; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v10, v1
+; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v6
+; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v10
 ; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v5
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
 ; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; TONGA-NEXT:    v_subrev_u32_e32 v6, vcc, v2, v0
+; TONGA-NEXT:    v_sub_u32_e32 v6, vcc, v0, v2
 ; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
-; TONGA-NEXT:    v_subrev_u32_e32 v7, vcc, v3, v1
+; TONGA-NEXT:    v_sub_u32_e32 v7, vcc, v1, v3
 ; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
 ; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
@@ -533,8 +533,8 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; TONGA-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
 ; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v8
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v9
-; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
-; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v9
+; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v8, v0
+; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v9, v1
 ; TONGA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; TONGA-NEXT:    s_endpgm
 ;
@@ -614,27 +614,26 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ;
 ; EG-LABEL: sdiv_v2i32:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 51, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 51, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 6:
-; EG-NEXT:     VTX_READ_64 T1.XY, T0.X, 8, #1
-; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
-; EG-NEXT:    ALU clause starting at 10:
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
-; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     SETGT_INT * T0.W, 0.0, T1.Y,
-; EG-NEXT:     ADD_INT T1.W, T1.Y, PV.W,
-; EG-NEXT:     SETGT_INT * T2.W, 0.0, T1.X,
-; EG-NEXT:     XOR_INT * T1.W, PV.W, T0.W,
-; EG-NEXT:     SUB_INT T0.Z, 0.0, PV.W,
-; EG-NEXT:     ADD_INT T3.W, T1.X, T2.W,
-; EG-NEXT:     RECIP_UINT * T1.X, PV.W,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     SETGT_INT * T1.W, 0.0, T0.W,
+; EG-NEXT:     ADD_INT T0.W, T0.W, PV.W,
+; EG-NEXT:     SETGT_INT * T2.W, 0.0, T0.Z,
+; EG-NEXT:     XOR_INT * T0.W, PV.W, T1.W,
+; EG-NEXT:     SUB_INT T1.Z, 0.0, PV.W,
+; EG-NEXT:     ADD_INT T3.W, T0.Z, T2.W,
+; EG-NEXT:     RECIP_UINT * T0.Z, PV.W,
 ; EG-NEXT:     XOR_INT T3.W, PV.W, T2.W,
-; EG-NEXT:     MULLO_INT * T0.Z, PV.Z, PS,
+; EG-NEXT:     MULLO_INT * T1.X, PV.Z, PS,
 ; EG-NEXT:     SUB_INT T4.W, 0.0, PV.W,
 ; EG-NEXT:     RECIP_UINT * T1.Y, PV.W,
 ; EG-NEXT:     SETGT_INT T5.W, 0.0, T0.X,
@@ -645,18 +644,18 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; EG-NEXT:     ADD_INT T1.Y, T1.Y, PS,
 ; EG-NEXT:     XOR_INT T1.Z, PV.W, T5.W,
 ; EG-NEXT:     ADD_INT T4.W, T0.Y, PV.Z, BS:VEC_120/SCL_212
-; EG-NEXT:     MULHI * T0.X, T1.X, T0.Z,
-; EG-NEXT:     ADD_INT T0.Z, T1.X, PS,
+; EG-NEXT:     MULHI * T0.X, T0.Z, T1.X,
+; EG-NEXT:     ADD_INT T0.Z, T0.Z, PS,
 ; EG-NEXT:     XOR_INT T4.W, PV.W, T2.Z,
 ; EG-NEXT:     MULHI * T0.X, PV.Z, PV.Y,
 ; EG-NEXT:     MULHI * T0.Y, PV.W, PV.Z,
-; EG-NEXT:     MULLO_INT * T0.Z, PS, T1.W,
+; EG-NEXT:     MULLO_INT * T0.Z, PS, T0.W,
 ; EG-NEXT:     SUB_INT T4.W, T4.W, PS,
 ; EG-NEXT:     MULLO_INT * T0.Z, T0.X, T3.W,
 ; EG-NEXT:     SUB_INT T1.Y, T1.Z, PS,
 ; EG-NEXT:     ADD_INT T0.Z, T0.Y, 1,
-; EG-NEXT:     SETGE_UINT T6.W, PV.W, T1.W,
-; EG-NEXT:     SUB_INT * T7.W, PV.W, T1.W,
+; EG-NEXT:     SETGE_UINT T6.W, PV.W, T0.W,
+; EG-NEXT:     SUB_INT * T7.W, PV.W, T0.W,
 ; EG-NEXT:     CNDE_INT T1.X, PV.W, T4.W, PS, BS:VEC_021/SCL_122
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, PV.Z,
 ; EG-NEXT:     ADD_INT T0.Z, T0.X, 1,
@@ -665,9 +664,9 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; EG-NEXT:     CNDE_INT T1.Y, PV.W, T1.Y, PS,
 ; EG-NEXT:     CNDE_INT T0.Z, PV.W, T0.X, PV.Z,
 ; EG-NEXT:     ADD_INT T4.W, PV.Y, 1,
-; EG-NEXT:     SETGE_UINT * T1.W, PV.X, T1.W,
+; EG-NEXT:     SETGE_UINT * T0.W, PV.X, T0.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PS, T0.Y, PV.W,
-; EG-NEXT:     XOR_INT T1.Z, T2.Z, T0.W, BS:VEC_021/SCL_122
+; EG-NEXT:     XOR_INT T1.Z, T2.Z, T1.W, BS:VEC_021/SCL_122
 ; EG-NEXT:     ADD_INT T0.W, PV.Z, 1,
 ; EG-NEXT:     SETGE_UINT * T1.W, PV.Y, T3.W,
 ; EG-NEXT:     CNDE_INT T0.Z, PS, T0.Z, PV.W,

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index dd65453e6fb3a..19f57b603723f 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -854,94 +854,92 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64
 define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
 ; GCN-LABEL: s_test_sdiv24_v2i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x11
+; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i64 s[4:5], s[4:5], 40
 ; GCN-NEXT:    s_ashr_i64 s[8:9], s[8:9], 40
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-NEXT:    s_ashr_i64 s[4:5], s[4:5], 40
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GCN-NEXT:    s_xor_b32 s4, s4, s8
-; GCN-NEXT:    s_ashr_i32 s4, s4, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-NEXT:    s_ashr_i32 s4, s4, 30
 ; GCN-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-NEXT:    s_ashr_i64 s[10:11], s[10:11], 40
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    s_ashr_i64 s[10:11], s[10:11], 40
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    s_ashr_i64 s[6:7], s[6:7], 40
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s10
+; GCN-NEXT:    s_ashr_i64 s[6:7], s[6:7], 40
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s6
 ; GCN-NEXT:    s_xor_b32 s4, s6, s10
-; GCN-NEXT:    s_ashr_i32 s4, s4, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
+; GCN-NEXT:    s_ashr_i32 s4, s4, 30
 ; GCN-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-NEXT:    v_mov_b32_e32 v5, s4
-; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
 ; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_bfe_i32 v2, v2, 0, 24
+; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_sdiv24_v2i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
-; GCN-IR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x11
+; GCN-IR-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
 ; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[4:5], 40
 ; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[8:9], 40
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[4:5], 40
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GCN-IR-NEXT:    s_xor_b32 s4, s4, s8
-; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
 ; GCN-IR-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-IR-NEXT:    s_ashr_i64 s[10:11], s[10:11], 40
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-IR-NEXT:    s_ashr_i64 s[10:11], s[10:11], 40
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[6:7], 40
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v2, s10
+; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[6:7], 40
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v3, s6
 ; GCN-IR-NEXT:    s_xor_b32 s4, s6, s10
-; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v2
+; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
 ; GCN-IR-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, s4
-; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v4, v4
 ; GCN-IR-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-IR-NEXT:    v_bfe_i32 v2, v2, 0, 24
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index 176f972e4d4f0..c7d6301de3419 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -216,16 +216,14 @@ define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32>
 }
 
 ; GCN-LABEL: {{^}}s_select_v2f32:
-; GCN-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
+; GCN-DAG: s_load_dwordx4 s{{\[}}[[ALO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
 ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
 
 ; GCN-DAG: v_cndmask_b32_e32
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_cndmask_b32_e32
 ; GCN: buffer_store_dwordx2
 define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
@@ -251,8 +249,7 @@ define amdgpu_kernel void @s_select_v3f32(<3 x float> addrspace(1)* %out, <3 x f
 }
 
 ; GCN-LABEL: {{^}}s_select_v4f32:
-; GCN: s_load_dwordx4
-; GCN: s_load_dwordx4
+; GCN: s_load_dwordx8
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
 
 ; GCN: v_cndmask_b32_e32

diff  --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index 206c62d683dee..b6457168d4eb2 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -451,64 +451,63 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_shl_v2i128ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
-; GCN-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x0
+; GCN-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v10, 16
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[8:9], 64
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[10:11], 0
-; GCN-NEXT:    s_sub_i32 s6, 64, s8
-; GCN-NEXT:    s_sub_i32 s4, s8, 64
-; GCN-NEXT:    s_lshr_b64 s[6:7], s[16:17], s6
-; GCN-NEXT:    s_lshl_b64 s[24:25], s[18:19], s8
-; GCN-NEXT:    s_lshl_b64 s[4:5], s[16:17], s4
-; GCN-NEXT:    s_or_b64 s[6:7], s[24:25], s[6:7]
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[12:13], 64
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[14:15], 0
+; GCN-NEXT:    s_sub_i32 s22, 64, s12
+; GCN-NEXT:    s_sub_i32 s20, s12, 64
+; GCN-NEXT:    s_lshr_b64 s[22:23], s[4:5], s22
+; GCN-NEXT:    s_lshl_b64 s[24:25], s[6:7], s12
+; GCN-NEXT:    s_lshl_b64 s[20:21], s[4:5], s20
+; GCN-NEXT:    s_or_b64 s[22:23], s[24:25], s[22:23]
 ; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[0:1], s[8:9], s[10:11]
-; GCN-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_or_b64 s[0:1], s[12:13], s[14:15]
+; GCN-NEXT:    v_mov_b32_e32 v0, s21
+; GCN-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NEXT:    v_mov_b32_e32 v1, s22
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s18
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
-; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[12:13], 64
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[14:15], 0
-; GCN-NEXT:    s_sub_i32 s6, 64, s12
-; GCN-NEXT:    s_sub_i32 s4, s12, 64
-; GCN-NEXT:    s_lshr_b64 s[6:7], s[20:21], s6
-; GCN-NEXT:    s_lshl_b64 s[10:11], s[22:23], s12
-; GCN-NEXT:    s_lshl_b64 s[4:5], s[20:21], s4
-; GCN-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
+; GCN-NEXT:    s_sub_i32 s13, 64, s16
+; GCN-NEXT:    s_sub_i32 s6, s16, 64
+; GCN-NEXT:    s_lshr_b64 s[14:15], s[8:9], s13
+; GCN-NEXT:    s_lshl_b64 s[20:21], s[10:11], s16
+; GCN-NEXT:    s_lshl_b64 s[6:7], s[8:9], s6
+; GCN-NEXT:    s_or_b64 s[14:15], s[20:21], s[14:15]
 ; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[2:3], s[12:13], s[14:15]
-; GCN-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_or_b64 s[2:3], s[16:17], s[18:19]
+; GCN-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-NEXT:    v_mov_b32_e32 v1, s15
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NEXT:    v_mov_b32_e32 v1, s11
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, v0, v1, s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s14
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v1, s22
+; GCN-NEXT:    v_mov_b32_e32 v1, s10
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v0, v1, s[2:3]
-; GCN-NEXT:    s_lshl_b64 s[2:3], s[16:17], s8
+; GCN-NEXT:    s_lshl_b64 s[2:3], s[4:5], s12
 ; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    s_lshl_b64 s[2:3], s[20:21], s12
+; GCN-NEXT:    s_lshl_b64 s[2:3], s[8:9], s16
 ; GCN-NEXT:    v_mov_b32_e32 v4, s3
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
 ; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
@@ -521,64 +520,63 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_lshr_v2i128_ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
-; GCN-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x0
+; GCN-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v10, 16
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[8:9], 64
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[10:11], 0
-; GCN-NEXT:    s_sub_i32 s6, 64, s8
-; GCN-NEXT:    s_sub_i32 s4, s8, 64
-; GCN-NEXT:    s_lshl_b64 s[6:7], s[18:19], s6
-; GCN-NEXT:    s_lshr_b64 s[24:25], s[16:17], s8
-; GCN-NEXT:    s_lshr_b64 s[4:5], s[18:19], s4
-; GCN-NEXT:    s_or_b64 s[6:7], s[24:25], s[6:7]
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[12:13], 64
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[14:15], 0
+; GCN-NEXT:    s_sub_i32 s22, 64, s12
+; GCN-NEXT:    s_sub_i32 s20, s12, 64
+; GCN-NEXT:    s_lshl_b64 s[22:23], s[6:7], s22
+; GCN-NEXT:    s_lshr_b64 s[24:25], s[4:5], s12
+; GCN-NEXT:    s_lshr_b64 s[20:21], s[6:7], s20
+; GCN-NEXT:    s_or_b64 s[22:23], s[24:25], s[22:23]
 ; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[0:1], s[8:9], s[10:11]
-; GCN-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_or_b64 s[0:1], s[12:13], s[14:15]
+; GCN-NEXT:    v_mov_b32_e32 v0, s21
+; GCN-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NEXT:    v_mov_b32_e32 v2, s22
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_mov_b32_e32 v2, s16
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[12:13], 64
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[14:15], 0
-; GCN-NEXT:    s_sub_i32 s6, 64, s12
-; GCN-NEXT:    s_sub_i32 s4, s12, 64
-; GCN-NEXT:    s_lshl_b64 s[6:7], s[22:23], s6
-; GCN-NEXT:    s_lshr_b64 s[10:11], s[20:21], s12
-; GCN-NEXT:    s_lshr_b64 s[4:5], s[22:23], s4
-; GCN-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
+; GCN-NEXT:    s_sub_i32 s13, 64, s16
+; GCN-NEXT:    s_sub_i32 s4, s16, 64
+; GCN-NEXT:    s_lshl_b64 s[14:15], s[10:11], s13
+; GCN-NEXT:    s_lshr_b64 s[20:21], s[8:9], s16
+; GCN-NEXT:    s_lshr_b64 s[4:5], s[10:11], s4
+; GCN-NEXT:    s_or_b64 s[14:15], s[20:21], s[14:15]
 ; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[2:3], s[12:13], s[14:15]
+; GCN-NEXT:    s_or_b64 s[2:3], s[16:17], s[18:19]
 ; GCN-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NEXT:    v_mov_b32_e32 v3, s15
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v3, s21
+; GCN-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, v2, v3, s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s6
+; GCN-NEXT:    v_mov_b32_e32 v3, s14
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v3, s20
+; GCN-NEXT:    v_mov_b32_e32 v3, s8
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v2, v3, s[2:3]
-; GCN-NEXT:    s_lshr_b64 s[2:3], s[18:19], s8
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[6:7], s12
 ; GCN-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    s_lshr_b64 s[2:3], s[22:23], s12
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[10:11], s16
 ; GCN-NEXT:    v_mov_b32_e32 v6, s3
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, v6, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v6, s2
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
 ; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
@@ -591,61 +589,61 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_ashr_v2i128_ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
-; GCN-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x0
+; GCN-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v10, 16
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[8:9], 64
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[10:11], 0
-; GCN-NEXT:    s_sub_i32 s6, 64, s8
-; GCN-NEXT:    s_sub_i32 s4, s8, 64
-; GCN-NEXT:    s_lshl_b64 s[6:7], s[18:19], s6
-; GCN-NEXT:    s_lshr_b64 s[24:25], s[16:17], s8
-; GCN-NEXT:    s_ashr_i64 s[4:5], s[18:19], s4
-; GCN-NEXT:    s_or_b64 s[6:7], s[24:25], s[6:7]
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[12:13], 64
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[14:15], 0
+; GCN-NEXT:    s_sub_i32 s22, 64, s12
+; GCN-NEXT:    s_sub_i32 s20, s12, 64
+; GCN-NEXT:    s_lshl_b64 s[22:23], s[6:7], s22
+; GCN-NEXT:    s_lshr_b64 s[24:25], s[4:5], s12
+; GCN-NEXT:    s_ashr_i64 s[20:21], s[6:7], s20
+; GCN-NEXT:    s_or_b64 s[22:23], s[24:25], s[22:23]
 ; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[0:1], s[8:9], s[10:11]
-; GCN-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_or_b64 s[0:1], s[12:13], s[14:15]
+; GCN-NEXT:    v_mov_b32_e32 v0, s21
+; GCN-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NEXT:    v_mov_b32_e32 v2, s22
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_mov_b32_e32 v2, s16
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[12:13], 64
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[14:15], 0
-; GCN-NEXT:    s_sub_i32 s6, 64, s12
-; GCN-NEXT:    s_sub_i32 s4, s12, 64
-; GCN-NEXT:    s_lshl_b64 s[6:7], s[22:23], s6
-; GCN-NEXT:    s_lshr_b64 s[10:11], s[20:21], s12
-; GCN-NEXT:    s_ashr_i64 s[4:5], s[22:23], s4
-; GCN-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
+; GCN-NEXT:    s_sub_i32 s13, 64, s16
+; GCN-NEXT:    s_sub_i32 s4, s16, 64
+; GCN-NEXT:    s_lshl_b64 s[14:15], s[10:11], s13
+; GCN-NEXT:    s_lshr_b64 s[20:21], s[8:9], s16
+; GCN-NEXT:    s_ashr_i64 s[4:5], s[10:11], s4
+; GCN-NEXT:    s_or_b64 s[14:15], s[20:21], s[14:15]
 ; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[2:3], s[12:13], s[14:15]
+; GCN-NEXT:    s_or_b64 s[2:3], s[16:17], s[18:19]
 ; GCN-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NEXT:    v_mov_b32_e32 v3, s15
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v3, s21
+; GCN-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, v2, v3, s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s6
+; GCN-NEXT:    v_mov_b32_e32 v3, s14
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v3, s20
+; GCN-NEXT:    v_mov_b32_e32 v3, s8
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v2, v3, s[2:3]
-; GCN-NEXT:    s_ashr_i32 s4, s19, 31
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[18:19], s8
+; GCN-NEXT:    s_ashr_i32 s4, s7, 31
+; GCN-NEXT:    s_ashr_i64 s[2:3], s[6:7], s12
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    v_mov_b32_e32 v6, s2
-; GCN-NEXT:    s_ashr_i32 s4, s23, 31
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[22:23], s12
+; GCN-NEXT:    s_ashr_i32 s4, s11, 31
+; GCN-NEXT:    s_ashr_i64 s[2:3], s[10:11], s16
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v6, s4
@@ -653,7 +651,6 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, v6, v7, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
 ; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 43e15f1f58d12..dbe7fb6cc0859 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -31,8 +31,7 @@ define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> add
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x8
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -45,20 +44,19 @@ define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> add
 ;
 ; EG-LABEL: shl_v2i32:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 6:
-; EG-NEXT:     VTX_READ_64 T1.XY, T0.X, 8, #1
-; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
-; EG-NEXT:    ALU clause starting at 10:
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
-; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     LSHL * T0.Y, T0.Y, T1.Y,
-; EG-NEXT:     LSHL T0.X, T0.X, T1.X,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     LSHL * T0.Y, T0.Y, T0.W,
+; EG-NEXT:     LSHL T0.X, T0.X, T0.Z,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
@@ -94,22 +92,21 @@ define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> add
 ;
 ; VI-LABEL: shl_v4i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x10
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; VI-NEXT:    s_mov_b32 s11, 0xf000
+; VI-NEXT:    s_mov_b32 s10, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s7, s7, s11
-; VI-NEXT:    s_lshl_b32 s6, s6, s10
-; VI-NEXT:    s_lshl_b32 s5, s5, s9
-; VI-NEXT:    s_lshl_b32 s4, s4, s8
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    s_lshl_b32 s3, s3, s7
+; VI-NEXT:    s_lshl_b32 s2, s2, s6
+; VI-NEXT:    s_lshl_b32 s1, s1, s5
+; VI-NEXT:    s_lshl_b32 s0, s0, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: shl_v4i32:
@@ -637,31 +634,29 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> add
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; SI-NEXT:    v_mov_b32_e32 v5, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
+; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
 ; SI-NEXT:    s_mov_b32 s4, 0xffff
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v4, v9, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v5, v8, v6
-; SI-NEXT:    v_and_b32_e32 v3, s4, v3
-; SI-NEXT:    v_and_b32_e32 v2, s4, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_or_b32_e32 v2, v2, v5
-; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, v9, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v3, v8, v6
+; SI-NEXT:    v_and_b32_e32 v1, s4, v1
+; SI-NEXT:    v_and_b32_e32 v0, s4, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v3
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: shl_v4i16:
@@ -672,10 +667,7 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> add
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
@@ -691,45 +683,40 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> add
 ;
 ; EG-LABEL: shl_v4i16:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 3, @15, KC0[], KC1[]
-; EG-NEXT:    TEX 0 @10
-; EG-NEXT:    ALU 49, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 53, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
-; EG-NEXT:    Fetch clause starting at 8:
-; EG-NEXT:     VTX_READ_64 T10.XY, T0.X, 0, #1
-; EG-NEXT:    Fetch clause starting at 10:
-; EG-NEXT:     VTX_READ_64 T10.XY, T0.X, 8, #1
-; EG-NEXT:    ALU clause starting at 12:
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_128 T10.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
 ; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
 ; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT:    ALU clause starting at 15:
+; EG-NEXT:    ALU clause starting at 11:
 ; EG-NEXT:     MOV T4.X, T10.X,
 ; EG-NEXT:     MOV * T5.X, T10.Y,
-; EG-NEXT:     MOV T0.Y, PV.X,
-; EG-NEXT:     MOV * T0.Z, PS,
-; EG-NEXT:    ALU clause starting at 19:
-; EG-NEXT:     MOV T2.X, T10.X,
-; EG-NEXT:     MOV * T3.X, T10.Y,
-; EG-NEXT:     MOV T0.X, T6.X,
-; EG-NEXT:     MOV * T1.Y, PV.X,
+; EG-NEXT:     MOV T0.X, PV.X,
+; EG-NEXT:     MOV T0.Y, PS,
+; EG-NEXT:     MOV * T2.X, T10.Z,
+; EG-NEXT:     MOV T3.X, T10.W,
+; EG-NEXT:     MOV * T0.Z, T6.X,
+; EG-NEXT:     MOV * T1.Y, T2.X,
 ; EG-NEXT:     AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT:     AND_INT * T2.W, T0.Y, literal.x,
+; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; EG-NEXT:     LSHL * T1.W, PS, PV.W,
 ; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
-; EG-NEXT:     AND_INT * T2.W, T0.X, literal.y,
+; EG-NEXT:     AND_INT * T2.W, T0.Z, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), -65536(nan)
 ; EG-NEXT:     OR_INT * T1.W, PS, PV.W,
-; EG-NEXT:     MOV T0.X, T3.X,
-; EG-NEXT:     MOV * T6.X, PV.W,
-; EG-NEXT:     MOV T1.Z, PS,
+; EG-NEXT:     MOV * T0.Z, T3.X,
+; EG-NEXT:     MOV * T6.X, T1.W,
+; EG-NEXT:     MOV T1.Z, PV.X,
 ; EG-NEXT:     LSHR T1.W, T1.Y, literal.x,
-; EG-NEXT:     LSHR * T2.W, T0.Y, literal.x,
+; EG-NEXT:     LSHR * T2.W, T0.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     LSHL T1.W, PS, PV.W,
 ; EG-NEXT:     AND_INT * T2.W, PV.Z, literal.x,
@@ -738,23 +725,23 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> add
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
 ; EG-NEXT:     MOV T6.X, PV.W,
-; EG-NEXT:     MOV T0.Y, T7.X,
-; EG-NEXT:     AND_INT T1.W, T0.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     AND_INT * T2.W, T0.Z, literal.x,
+; EG-NEXT:     MOV * T0.X, T7.X,
+; EG-NEXT:     AND_INT T1.W, T0.Z, literal.x,
+; EG-NEXT:     AND_INT * T2.W, T0.Y, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; EG-NEXT:     LSHL T1.W, PS, PV.W,
-; EG-NEXT:     AND_INT * T2.W, PV.Y, literal.x,
+; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
 ; EG-NEXT:    -65536(nan), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT * T1.W, PV.W, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
 ; EG-NEXT:     MOV * T7.X, PV.W,
-; EG-NEXT:     MOV T0.Y, PV.X,
-; EG-NEXT:     LSHR T1.W, T0.X, literal.x,
-; EG-NEXT:     LSHR * T2.W, T0.Z, literal.x,
+; EG-NEXT:     MOV T0.X, PV.X,
+; EG-NEXT:     LSHR T1.W, T0.Z, literal.x,
+; EG-NEXT:     LSHR * T2.W, T0.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     LSHL * T1.W, PS, PV.W,
-; EG-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
+; EG-NEXT:     AND_INT T0.Z, T0.X, literal.x,
 ; EG-NEXT:     LSHL T1.W, PV.W, literal.y,
 ; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
@@ -864,20 +851,19 @@ define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> add
 ;
 ; VI-LABEL: shl_v2i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x10
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; VI-NEXT:    s_mov_b32 s11, 0xf000
+; VI-NEXT:    s_mov_b32 s10, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s10
-; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
+; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s4
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: shl_v2i64:
@@ -956,8 +942,7 @@ define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> add
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dwordx8 s[0:7], s[18:19], 0x0
-; VI-NEXT:    s_load_dwordx8 s[8:15], s[18:19], 0x20
+; VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; VI-NEXT:    s_mov_b32 s19, 0xf000
 ; VI-NEXT:    s_mov_b32 s18, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 0134ff6c319be..08b045e4a623f 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -7,8 +7,7 @@
 define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
 ; GFX9-LABEL: s_shl_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x30
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -20,29 +19,27 @@ define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %
 ;
 ; VI-LABEL: s_shl_v2i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x30
-; VI-NEXT:    s_mov_b32 s1, 0xffff
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s6, 0xffff
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s3, s2, s1
-; VI-NEXT:    s_lshr_b32 s2, s2, 16
-; VI-NEXT:    s_lshr_b32 s8, s0, 16
-; VI-NEXT:    s_lshl_b32 s2, s2, s8
-; VI-NEXT:    s_lshl_b32 s0, s3, s0
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_and_b32 s0, s0, s1
-; VI-NEXT:    s_or_b32 s0, s0, s2
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_and_b32 s7, s4, s6
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s8, s5, 16
+; VI-NEXT:    s_lshl_b32 s4, s4, s8
+; VI-NEXT:    s_lshl_b32 s5, s7, s5
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_and_b32 s5, s5, s6
+; VI-NEXT:    s_or_b32 s4, s5, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: s_shl_v2i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s4, s[0:1], 0xb
-; CI-NEXT:    s_load_dword s5, s[0:1], 0xc
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
@@ -60,9 +57,8 @@ define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %
 ;
 ; GFX10-LABEL: s_shl_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x30
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX10-NEXT:    s_mov_b32 s6, -1
@@ -79,35 +75,31 @@ define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> a
 ; GFX9-LABEL: v_shl_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_shl_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v5, v[0:1]
-; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v5
-; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_lshlrev_b16_e32 v4, v1, v0
+; VI-NEXT:    v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v0, v4, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_shl_v2i16:
@@ -119,17 +111,15 @@ define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> a
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4
+; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, v5, v4
-; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; CI-NEXT:    s_endpgm
@@ -137,14 +127,12 @@ define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> a
 ; GFX10-LABEL: v_shl_v2i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v2, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
+; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -464,8 +452,7 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
@@ -480,10 +467,7 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
@@ -502,31 +486,29 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, 0
-; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT:    v_mov_b32_e32 v5, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
 ; CI-NEXT:    s_mov_b32 s4, 0xffff
 ; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; CI-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; CI-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
-; CI-NEXT:    v_lshlrev_b32_e32 v4, v9, v7
-; CI-NEXT:    v_lshlrev_b32_e32 v5, v8, v6
-; CI-NEXT:    v_and_b32_e32 v3, s4, v3
-; CI-NEXT:    v_and_b32_e32 v2, s4, v2
-; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; CI-NEXT:    v_or_b32_e32 v3, v3, v4
-; CI-NEXT:    v_or_b32_e32 v2, v2, v5
-; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; CI-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
+; CI-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v2, v9, v7
+; CI-NEXT:    v_lshlrev_b32_e32 v3, v8, v6
+; CI-NEXT:    v_and_b32_e32 v1, s4, v1
+; CI-NEXT:    v_and_b32_e32 v0, s4, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT:    v_or_b32_e32 v1, v1, v2
+; CI-NEXT:    v_or_b32_e32 v0, v0, v3
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_shl_v4i16:
@@ -534,9 +516,7 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v2, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index 491b1c742b8b3..090ac93961235 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -46,20 +46,19 @@ define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ;
 ; EG-LABEL: ashr_v2i32:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 6:
-; EG-NEXT:     VTX_READ_64 T1.XY, T0.X, 8, #1
-; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
-; EG-NEXT:    ALU clause starting at 10:
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
-; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     ASHR * T0.Y, T0.Y, T1.Y,
-; EG-NEXT:     ASHR T0.X, T0.X, T1.X,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     ASHR * T0.Y, T0.Y, T0.W,
+; EG-NEXT:     ASHR T0.X, T0.X, T0.Z,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
@@ -193,25 +192,23 @@ define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> ad
 ;
 ; EG-LABEL: ashr_v2i16:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 14, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 6:
-; EG-NEXT:     VTX_READ_32 T7.X, T7.X, 0, #1
-; EG-NEXT:     VTX_READ_32 T6.X, T6.X, 4, #1
-; EG-NEXT:    ALU clause starting at 10:
+; EG-NEXT:     VTX_READ_64 T6.XY, T6.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T6.X, KC0[2].Z,
-; EG-NEXT:     MOV * T7.X, PV.X,
-; EG-NEXT:    ALU clause starting at 12:
-; EG-NEXT:     LSHR * T0.W, T7.X, literal.x,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     LSHR * T0.W, T6.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T0.Y, PV.W, 0.0, literal.x,
-; EG-NEXT:     LSHR T0.Z, T6.X, literal.x,
-; EG-NEXT:     BFE_INT T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     AND_INT * T1.W, T6.X, literal.y,
+; EG-NEXT:     LSHR T0.Z, T6.Y, literal.x,
+; EG-NEXT:     BFE_INT T0.W, T6.X, 0.0, literal.x,
+; EG-NEXT:     AND_INT * T1.W, T6.Y, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
 ; EG-NEXT:     ASHR T0.W, PV.W, PS,
 ; EG-NEXT:     ASHR * T1.W, PV.Y, PV.Z,
@@ -253,10 +250,10 @@ define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> ad
 ; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; SI-NEXT:    v_ashrrev_i32_e32 v1, v7, v1
-; SI-NEXT:    v_ashrrev_i32_e32 v3, v3, v5
-; SI-NEXT:    v_ashrrev_i32_e32 v0, v6, v0
-; SI-NEXT:    v_ashrrev_i32_e32 v2, v2, v4
+; SI-NEXT:    v_ashr_i32_e32 v1, v1, v7
+; SI-NEXT:    v_ashr_i32_e32 v3, v5, v3
+; SI-NEXT:    v_ashr_i32_e32 v0, v0, v6
+; SI-NEXT:    v_ashr_i32_e32 v2, v4, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v3, s2, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -291,28 +288,23 @@ define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> ad
 ;
 ; EG-LABEL: ashr_v4i16:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 3, @13, KC0[], KC1[]
-; EG-NEXT:    TEX 0 @10
-; EG-NEXT:    ALU 54, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 58, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
-; EG-NEXT:    Fetch clause starting at 8:
-; EG-NEXT:     VTX_READ_64 T10.XY, T9.X, 0, #1
-; EG-NEXT:    Fetch clause starting at 10:
-; EG-NEXT:     VTX_READ_64 T9.XY, T9.X, 8, #1
-; EG-NEXT:    ALU clause starting at 12:
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_128 T9.XYZW, T9.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T9.X, KC0[2].Z,
-; EG-NEXT:    ALU clause starting at 13:
-; EG-NEXT:     MOV T4.X, T10.X,
-; EG-NEXT:     MOV * T5.X, T10.Y,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     MOV T4.X, T9.X,
+; EG-NEXT:     MOV * T5.X, T9.Y,
 ; EG-NEXT:     MOV T0.Y, PV.X,
 ; EG-NEXT:     MOV * T0.Z, PS,
-; EG-NEXT:    ALU clause starting at 17:
-; EG-NEXT:     MOV T2.X, T9.X,
-; EG-NEXT:     MOV * T3.X, T9.Y,
+; EG-NEXT:     MOV T2.X, T9.Z,
+; EG-NEXT:     MOV * T3.X, T9.W,
 ; EG-NEXT:     MOV * T0.W, T6.X,
 ; EG-NEXT:     MOV T1.Y, T2.X,
 ; EG-NEXT:     BFE_INT * T1.W, T0.Y, 0.0, literal.x,

diff  --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
index 88cd9b00c9859..f2d392144344c 100644
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -85,8 +85,7 @@ define amdgpu_kernel void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x8
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -99,20 +98,19 @@ define amdgpu_kernel void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ;
 ; EG-LABEL: lshr_v2i32:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 6:
-; EG-NEXT:     VTX_READ_64 T1.XY, T0.X, 8, #1
-; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
-; EG-NEXT:    ALU clause starting at 10:
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
-; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     LSHR * T0.Y, T0.Y, T1.Y,
-; EG-NEXT:     LSHR T0.X, T0.X, T1.X,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     LSHR * T0.Y, T0.Y, T0.W,
+; EG-NEXT:     LSHR T0.X, T0.X, T0.Z,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
@@ -148,22 +146,21 @@ define amdgpu_kernel void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ;
 ; VI-LABEL: lshr_v4i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x10
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; VI-NEXT:    s_mov_b32 s11, 0xf000
+; VI-NEXT:    s_mov_b32 s10, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s7, s7, s11
-; VI-NEXT:    s_lshr_b32 s6, s6, s10
-; VI-NEXT:    s_lshr_b32 s5, s5, s9
-; VI-NEXT:    s_lshr_b32 s4, s4, s8
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    s_lshr_b32 s3, s3, s7
+; VI-NEXT:    s_lshr_b32 s2, s2, s6
+; VI-NEXT:    s_lshr_b32 s1, s1, s5
+; VI-NEXT:    s_lshr_b32 s0, s0, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: lshr_v4i32:
@@ -289,8 +286,7 @@ define amdgpu_kernel void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dwordx8 s[0:7], s[18:19], 0x0
-; VI-NEXT:    s_load_dwordx8 s[8:15], s[18:19], 0x20
+; VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; VI-NEXT:    s_mov_b32 s19, 0xf000
 ; VI-NEXT:    s_mov_b32 s18, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index a229ce1146aba..4c5e61ca626ce 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -165,21 +165,19 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <
 define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
 ; GFX9-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x30
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_pk_sub_i16 v0, s3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    v_pk_sub_i16 v0, s2, v0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_sub_v2i16_kernarg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT:    s_load_dword s5, s[0:1], 0x30
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
@@ -197,9 +195,8 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out
 ;
 ; GFX10-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x30
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX10-NEXT:    s_mov_b32 s6, -1

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index d678b64f54b69..574c7671de936 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -547,50 +547,49 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ;
 ; EG-LABEL: udiv_v2i32:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 33, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 33, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 6:
-; EG-NEXT:     VTX_READ_64 T1.XY, T0.X, 8, #1
-; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
-; EG-NEXT:    ALU clause starting at 10:
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
-; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     SUB_INT T0.W, 0.0, T1.Y,
-; EG-NEXT:     RECIP_UINT * T0.Z, T1.Y,
-; EG-NEXT:     MULLO_INT * T0.W, PV.W, PS,
-; EG-NEXT:     SUB_INT T1.W, 0.0, T1.X,
-; EG-NEXT:     RECIP_UINT * T1.Z, T1.X,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     SUB_INT T1.W, 0.0, T0.W,
+; EG-NEXT:     RECIP_UINT * T1.X, T0.W,
+; EG-NEXT:     MULLO_INT * T1.Y, PV.W, PS,
+; EG-NEXT:     SUB_INT T1.W, 0.0, T0.Z,
+; EG-NEXT:     RECIP_UINT * T1.Z, T0.Z,
 ; EG-NEXT:     MULLO_INT * T1.W, PV.W, PS,
 ; EG-NEXT:     MULHI * T1.W, T1.Z, PS,
 ; EG-NEXT:     ADD_INT T1.W, T1.Z, PS,
-; EG-NEXT:     MULHI * T0.W, T0.Z, T0.W,
-; EG-NEXT:     ADD_INT T0.W, T0.Z, PS,
-; EG-NEXT:     MULHI * T0.Z, T0.X, PV.W,
-; EG-NEXT:     MULHI * T0.W, T0.Y, PV.W,
-; EG-NEXT:     MULLO_INT * T1.Z, PS, T1.Y,
+; EG-NEXT:     MULHI * T1.Y, T1.X, T1.Y,
+; EG-NEXT:     ADD_INT T2.W, T1.X, PS,
+; EG-NEXT:     MULHI * T1.X, T0.X, PV.W,
+; EG-NEXT:     MULHI * T1.Y, T0.Y, PV.W,
+; EG-NEXT:     MULLO_INT * T1.Z, PS, T0.W,
 ; EG-NEXT:     SUB_INT T1.W, T0.Y, PS,
-; EG-NEXT:     MULLO_INT * T0.Y, T0.Z, T1.X,
+; EG-NEXT:     MULLO_INT * T0.Y, T1.X, T0.Z,
 ; EG-NEXT:     SUB_INT T0.Y, T0.X, PS,
-; EG-NEXT:     ADD_INT T1.Z, T0.W, 1,
-; EG-NEXT:     SETGE_UINT T2.W, PV.W, T1.Y,
-; EG-NEXT:     SUB_INT * T3.W, PV.W, T1.Y,
+; EG-NEXT:     ADD_INT T1.Z, T1.Y, 1,
+; EG-NEXT:     SETGE_UINT T2.W, PV.W, T0.W,
+; EG-NEXT:     SUB_INT * T3.W, PV.W, T0.W,
 ; EG-NEXT:     CNDE_INT T0.X, PV.W, T1.W, PS,
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, T0.W, PV.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     ADD_INT T1.Z, T0.Z, 1,
-; EG-NEXT:     SETGE_UINT T0.W, PV.Y, T1.X,
-; EG-NEXT:     SUB_INT * T1.W, PV.Y, T1.X,
-; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, PS, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T0.Z, PV.W, T0.Z, PV.Z,
-; EG-NEXT:     ADD_INT T0.W, PV.Y, 1,
-; EG-NEXT:     SETGE_UINT * T1.W, PV.X, T1.Y,
-; EG-NEXT:     CNDE_INT T1.Y, PS, T2.Y, PV.W,
+; EG-NEXT:     CNDE_INT T1.Y, PV.W, T1.Y, PV.Z,
+; EG-NEXT:     ADD_INT T1.Z, T1.X, 1,
+; EG-NEXT:     SETGE_UINT T1.W, PV.Y, T0.Z,
+; EG-NEXT:     SUB_INT * T2.W, PV.Y, T0.Z,
+; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, PS,
+; EG-NEXT:     CNDE_INT T1.Z, PV.W, T1.X, PV.Z,
+; EG-NEXT:     ADD_INT T1.W, PV.Y, 1,
+; EG-NEXT:     SETGE_UINT * T0.W, PV.X, T0.W,
+; EG-NEXT:     CNDE_INT T1.Y, PS, T1.Y, PV.W,
 ; EG-NEXT:     ADD_INT T0.W, PV.Z, 1,
-; EG-NEXT:     SETGE_UINT * T1.W, PV.Y, T1.X,
-; EG-NEXT:     CNDE_INT T1.X, PS, T0.Z, PV.W,
+; EG-NEXT:     SETGE_UINT * T1.W, PV.Y, T0.Z,
+; EG-NEXT:     CNDE_INT T1.X, PS, T1.Z, PV.W,
 ; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1

diff  --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 00bae7afa10fa..d7d5546e93fa9 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -491,86 +491,84 @@ define amdgpu_kernel void @s_test_urem31_i64(i64 addrspace(1)* %out, i64 %x, i64
 define amdgpu_kernel void @s_test_urem31_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
 ; GCN-LABEL: s_test_urem31_v2i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
-; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s5, 1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GCN-NEXT:    s_lshr_b32 s1, s1, 1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s1
-; GCN-NEXT:    s_lshr_b32 s2, s3, 1
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_lshr_b32 s3, s7, 1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s3
-; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s2
-; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GCN-NEXT:    s_lshr_b32 s4, s5, 1
+; GCN-NEXT:    s_lshr_b32 s5, s9, 1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GCN-NEXT:    s_lshr_b32 s6, s11, 1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GCN-NEXT:    s_lshr_b32 s7, s7, 1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s7
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v3
+; GCN-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GCN-NEXT:    v_trunc_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GCN-NEXT:    v_mad_f32 v2, -v4, v0, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GCN-NEXT:    v_mul_f32_e32 v2, v5, v6
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v4
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GCN-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s1, v0
-; GCN-NEXT:    v_mad_f32 v2, -v2, v4, v3
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, s3
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_brev_b32 s0, -2
-; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
-; GCN-NEXT:    v_and_b32_e32 v2, s0, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GCN-NEXT:    v_mad_f32 v2, -v2, v3, v5
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v2, s6
+; GCN-NEXT:    s_brev_b32 s4, -2
+; GCN-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s7, v2
+; GCN-NEXT:    v_and_b32_e32 v2, s4, v2
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_urem31_v2i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
-; GCN-IR-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
-; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s10, -1
+; GCN-IR-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
+; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_lshr_b32 s0, s5, 1
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GCN-IR-NEXT:    s_lshr_b32 s1, s1, 1
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, s1
-; GCN-IR-NEXT:    s_lshr_b32 s2, s3, 1
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-IR-NEXT:    s_lshr_b32 s3, s7, 1
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v4, s3
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v3, s2
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GCN-IR-NEXT:    s_lshr_b32 s4, s5, 1
+; GCN-IR-NEXT:    s_lshr_b32 s5, s9, 1
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GCN-IR-NEXT:    s_lshr_b32 s6, s11, 1
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GCN-IR-NEXT:    s_lshr_b32 s7, s7, 1
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v5, s7
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v6, v3
+; GCN-IR-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GCN-IR-NEXT:    v_trunc_f32_e32 v4, v4
+; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GCN-IR-NEXT:    v_mad_f32 v2, -v4, v0, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, v5, v6
+; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v4
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s1, v0
-; GCN-IR-NEXT:    v_mad_f32 v2, -v2, v4, v3
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_mul_lo_u32 v2, v2, s3
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IR-NEXT:    s_brev_b32 s0, -2
-; GCN-IR-NEXT:    v_and_b32_e32 v0, s0, v0
-; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
-; GCN-IR-NEXT:    v_and_b32_e32 v2, s0, v2
+; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GCN-IR-NEXT:    v_mad_f32 v2, -v2, v3, v5
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
+; GCN-IR-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
+; GCN-IR-NEXT:    v_mul_lo_u32 v2, v2, s6
+; GCN-IR-NEXT:    s_brev_b32 s4, -2
+; GCN-IR-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s7, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v2, s4, v2
+; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = lshr <2 x i64> %x, <i64 33, i64 33>
   %2 = lshr <2 x i64> %y, <i64 33, i64 33>
@@ -639,86 +637,84 @@ define amdgpu_kernel void @s_test_urem24_i64(i64 addrspace(1)* %out, i64 %x, i64
 define amdgpu_kernel void @s_test_urem23_64_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
 ; GCN-LABEL: s_test_urem23_64_v2i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
-; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s5, 1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GCN-NEXT:    s_lshr_b32 s1, s1, 1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s1
-; GCN-NEXT:    s_lshr_b32 s2, s3, 9
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_lshr_b32 s3, s7, 9
-; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s3
-; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s2
-; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GCN-NEXT:    s_lshr_b32 s4, s5, 1
+; GCN-NEXT:    s_lshr_b32 s5, s9, 1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GCN-NEXT:    s_lshr_b32 s6, s11, 9
+; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GCN-NEXT:    s_lshr_b32 s7, s7, 9
+; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s7
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v3
+; GCN-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GCN-NEXT:    v_trunc_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GCN-NEXT:    v_mad_f32 v2, -v4, v0, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GCN-NEXT:    v_mul_f32_e32 v2, v5, v6
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v4
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GCN-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s1, v0
-; GCN-NEXT:    v_mad_f32 v2, -v2, v4, v3
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, s3
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_brev_b32 s0, -2
-; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
-; GCN-NEXT:    v_and_b32_e32 v2, s0, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GCN-NEXT:    v_mad_f32 v2, -v2, v3, v5
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v2, s6
+; GCN-NEXT:    s_brev_b32 s4, -2
+; GCN-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s7, v2
+; GCN-NEXT:    v_and_b32_e32 v2, s4, v2
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_urem23_64_v2i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
-; GCN-IR-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
-; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s10, -1
+; GCN-IR-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
+; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_lshr_b32 s0, s5, 1
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GCN-IR-NEXT:    s_lshr_b32 s1, s1, 1
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, s1
-; GCN-IR-NEXT:    s_lshr_b32 s2, s3, 9
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-IR-NEXT:    s_lshr_b32 s3, s7, 9
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v4, s3
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v3, s2
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GCN-IR-NEXT:    s_lshr_b32 s4, s5, 1
+; GCN-IR-NEXT:    s_lshr_b32 s5, s9, 1
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GCN-IR-NEXT:    s_lshr_b32 s6, s11, 9
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GCN-IR-NEXT:    s_lshr_b32 s7, s7, 9
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v5, s7
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v6, v3
+; GCN-IR-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GCN-IR-NEXT:    v_trunc_f32_e32 v4, v4
+; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GCN-IR-NEXT:    v_mad_f32 v2, -v4, v0, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, v5, v6
+; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v4
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s1, v0
-; GCN-IR-NEXT:    v_mad_f32 v2, -v2, v4, v3
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_mul_lo_u32 v2, v2, s3
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IR-NEXT:    s_brev_b32 s0, -2
-; GCN-IR-NEXT:    v_and_b32_e32 v0, s0, v0
-; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
-; GCN-IR-NEXT:    v_and_b32_e32 v2, s0, v2
+; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GCN-IR-NEXT:    v_mad_f32 v2, -v2, v3, v5
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
+; GCN-IR-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
+; GCN-IR-NEXT:    v_mul_lo_u32 v2, v2, s6
+; GCN-IR-NEXT:    s_brev_b32 s4, -2
+; GCN-IR-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s7, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v2, s4, v2
+; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = lshr <2 x i64> %x, <i64 33, i64 41>
   %2 = lshr <2 x i64> %y, <i64 33, i64 41>

diff  --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll
new file mode 100644
index 0000000000000..41b10f087caa9
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll
@@ -0,0 +1,96 @@
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -load-store-vectorizer -S -o - %s | FileCheck %s
+
+define void @ldg_f16(half* nocapture align 16 %rd0) {
+  %in1b = bitcast half* %rd0 to <2 x half>*
+  %load1 = load <2 x half>, <2 x half>* %in1b, align 4
+  %p1 = fcmp ogt <2 x half> %load1, zeroinitializer
+  %s1 = select <2 x i1> %p1, <2 x half> %load1, <2 x half> zeroinitializer
+  store <2 x half> %s1, <2 x half>* %in1b, align 4
+  %in2 = getelementptr half, half* %rd0, i64 2
+  %in2b = bitcast half* %in2 to <2 x half>*
+  %load2 = load <2 x half>, <2 x half>* %in2b, align 4
+  %p2 = fcmp ogt <2 x half> %load2, zeroinitializer
+  %s2 = select <2 x i1> %p2, <2 x half> %load2, <2 x half> zeroinitializer
+  store <2 x half> %s2, <2 x half>* %in2b, align 4
+  %in3 = getelementptr half, half* %rd0, i64 4
+  %in3b = bitcast half* %in3 to <2 x half>*
+  %load3 = load <2 x half>, <2 x half>* %in3b, align 4
+  %p3 = fcmp ogt <2 x half> %load3, zeroinitializer
+  %s3 = select <2 x i1> %p3, <2 x half> %load3, <2 x half> zeroinitializer
+  store <2 x half> %s3, <2 x half>* %in3b, align 4
+  %in4 = getelementptr half, half* %rd0, i64 6
+  %in4b = bitcast half* %in4 to <2 x half>*
+  %load4 = load <2 x half>, <2 x half>* %in4b, align 4
+  %p4 = fcmp ogt <2 x half> %load4, zeroinitializer
+  %s4 = select <2 x i1> %p4, <2 x half> %load4, <2 x half> zeroinitializer
+  store <2 x half> %s4, <2 x half>* %in4b, align 4
+  ret void
+
+; CHECK-LABEL: @ldg_f16
+; CHECK: %[[LD:.*]] = load <8 x half>, <8 x half>*
+; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 2, i32 3>
+; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 4, i32 5>
+; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 6, i32 7>
+; CHECK: store <8 x half>
+}
+
+define void @no_nonpow2_vector(half* nocapture align 16 %rd0) {
+  %in1b = bitcast half* %rd0 to <3 x half>*
+  %load1 = load <3 x half>, <3 x half>* %in1b, align 4
+  %p1 = fcmp ogt <3 x half> %load1, zeroinitializer
+  %s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer
+  store <3 x half> %s1, <3 x half>* %in1b, align 4
+  %in2 = getelementptr half, half* %rd0, i64 3
+  %in2b = bitcast half* %in2 to <3 x half>*
+  %load2 = load <3 x half>, <3 x half>* %in2b, align 4
+  %p2 = fcmp ogt <3 x half> %load2, zeroinitializer
+  %s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer
+  store <3 x half> %s2, <3 x half>* %in2b, align 4
+  %in3 = getelementptr half, half* %rd0, i64 6
+  %in3b = bitcast half* %in3 to <3 x half>*
+  %load3 = load <3 x half>, <3 x half>* %in3b, align 4
+  %p3 = fcmp ogt <3 x half> %load3, zeroinitializer
+  %s3 = select <3 x i1> %p3, <3 x half> %load3, <3 x half> zeroinitializer
+  store <3 x half> %s3, <3 x half>* %in3b, align 4
+  %in4 = getelementptr half, half* %rd0, i64 9
+  %in4b = bitcast half* %in4 to <3 x half>*
+  %load4 = load <3 x half>, <3 x half>* %in4b, align 4
+  %p4 = fcmp ogt <3 x half> %load4, zeroinitializer
+  %s4 = select <3 x i1> %p4, <3 x half> %load4, <3 x half> zeroinitializer
+  store <3 x half> %s4, <3 x half>* %in4b, align 4
+  ret void
+
+; CHECK-LABEL: @no_nonpow2_vector
+; CHECK-NOT: shufflevector
+}
+
+define void @no_pointer_vector(half** nocapture align 16 %rd0) {
+  %in1b = bitcast half** %rd0 to <2 x half*>*
+  %load1 = load <2 x half*>, <2 x half*>* %in1b, align 4
+  %p1 = icmp ne <2 x half*> %load1, zeroinitializer
+  %s1 = select <2 x i1> %p1, <2 x half*> %load1, <2 x half*> zeroinitializer
+  store <2 x half*> %s1, <2 x half*>* %in1b, align 4
+  %in2 = getelementptr half*, half** %rd0, i64 2
+  %in2b = bitcast half** %in2 to <2 x half*>*
+  %load2 = load <2 x half*>, <2 x half*>* %in2b, align 4
+  %p2 = icmp ne <2 x half*> %load2, zeroinitializer
+  %s2 = select <2 x i1> %p2, <2 x half*> %load2, <2 x half*> zeroinitializer
+  store <2 x half*> %s2, <2 x half*>* %in2b, align 4
+  %in3 = getelementptr half*, half** %rd0, i64 4
+  %in3b = bitcast half** %in3 to <2 x half*>*
+  %load3 = load <2 x half*>, <2 x half*>* %in3b, align 4
+  %p3 = icmp ne <2 x half*> %load3, zeroinitializer
+  %s3 = select <2 x i1> %p3, <2 x half*> %load3, <2 x half*> zeroinitializer
+  store <2 x half*> %s3, <2 x half*>* %in3b, align 4
+  %in4 = getelementptr half*, half** %rd0, i64 6
+  %in4b = bitcast half** %in4 to <2 x half*>*
+  %load4 = load <2 x half*>, <2 x half*>* %in4b, align 4
+  %p4 = icmp ne <2 x half*> %load4, zeroinitializer
+  %s4 = select <2 x i1> %p4, <2 x half*> %load4, <2 x half*> zeroinitializer
+  store <2 x half*> %s4, <2 x half*>* %in4b, align 4
+  ret void
+
+; CHECK-LABEL: @no_pointer_vector
+; CHECK-NOT: shufflevector
+}


        


More information about the llvm-commits mailing list