[llvm] 10cef70 - AMDGPU: Clean up LDS-related occupancy calculations

Nicolai Hähnle via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 23 12:43:33 PST 2023


Author: Nicolai Hähnle
Date: 2023-01-23T21:43:06+01:00
New Revision: 10cef708a7ccaf69c18be86460583f9b62ee3c29

URL: https://github.com/llvm/llvm-project/commit/10cef708a7ccaf69c18be86460583f9b62ee3c29
DIFF: https://github.com/llvm/llvm-project/commit/10cef708a7ccaf69c18be86460583f9b62ee3c29.diff

LOG: AMDGPU: Clean up LDS-related occupancy calculations

Occupancy is expressed as waves per SIMD. This means that we need to
take into account the number of SIMDs per "CU" or, to be more precise,
the number of SIMDs over which a workgroup may be distributed.

getOccupancyWithLocalMemSize was wrong because it didn't take SIMDs
into account at all.

At the same time, we need to take into account that WGP mode offers
access to a larger total amount of LDS, since this can affect how
non-power-of-two LDS allocations are rounded. To make this work
consistently, we distinguish between (available) local memory size and
addressable local memory size (which is always limited to 64kB on
gfx10+, even with WGP mode).

This change results in a massive amount of test churn. A lot of it is
caused by the fact that the default work group size is 1024, which means
that (due to rounding effects) the default occupancy on older hardware
is 8 instead of 10, which affects scheduling via register pressure
estimates. I've adjusted most tests by just running the UTC tools, but
in some cases I manually changed the work group size to 32 or 64 to make
sure that work group size chunkiness has no effect.

Differential Revision: https://reviews.llvm.org/D139468

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
    llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
    llvm/lib/Target/AMDGPU/R600Subtarget.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
    llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
    llvm/test/CodeGen/AMDGPU/bf16.ll
    llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
    llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir
    llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
    llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
    llvm/test/CodeGen/AMDGPU/half.ll
    llvm/test/CodeGen/AMDGPU/idot8s.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
    llvm/test/CodeGen/AMDGPU/licm-regpressure.mir
    llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
    llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
    llvm/test/CodeGen/AMDGPU/load-global-i16.ll
    llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir
    llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
    llvm/test/CodeGen/AMDGPU/memory_clause.mir
    llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
    llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
    llvm/test/CodeGen/AMDGPU/pr51516.mir
    llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
    llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
    llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
    llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
    llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
    llvm/test/CodeGen/AMDGPU/sdiv.ll
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/shift-i128.ll
    llvm/test/CodeGen/AMDGPU/shl.ll
    llvm/test/CodeGen/AMDGPU/sra.ll
    llvm/test/CodeGen/AMDGPU/srl.ll
    llvm/test/CodeGen/AMDGPU/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/udiv.ll
    llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d3e21f6ff6c15..c916d5d547c44 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -842,11 +842,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
     Ctx.diagnose(Diag);
   }
 
-  if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
+  if (MFI->getLDSSize() >
+      static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
     LLVMContext &Ctx = MF.getFunction().getContext();
-    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
-                                     MFI->getLDSSize(),
-                                     STM.getLocalMemorySize(), DS_Error);
+    DiagnosticInfoResourceLimit Diag(
+        MF.getFunction(), "local memory", MFI->getLDSSize(),
+        STM.getAddressableLocalMemorySize(), DS_Error);
     Ctx.diagnose(Diag);
   }
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index ec1613970463b..a7da4005e867e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -798,7 +798,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
     }
   }
 
-  LocalMemLimit = ST.getLocalMemorySize();
+  LocalMemLimit = ST.getAddressableLocalMemorySize();
   if (LocalMemLimit == 0)
     return false;
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 4ddc8cf9dfe6d..03ccd563975fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -141,6 +141,12 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
       HasMovrel = true;
   }
 
+  AddressableLocalMemorySize = LocalMemorySize;
+
+  if (AMDGPU::isGFX10Plus(*this) &&
+      !getFeatureBits().test(AMDGPU::FeatureCuMode))
+    LocalMemorySize *= 2;
+
   // Don't crash on invalid devices.
   if (WavefrontSizeLog2 == 0)
     WavefrontSizeLog2 = 5;
@@ -304,19 +310,29 @@ bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
   }
 }
 
-unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
-  const Function &F) const {
-  if (NWaves == 1)
-    return getLocalMemorySize();
-  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
-  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
-  if (!WorkGroupsPerCu)
-    return 0;
-  unsigned MaxWaves = getMaxWavesPerEU();
-  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
+// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
+// allows the given function to achieve an occupancy of NWaves waves per
+// SIMD / EU, taking into account only the function's *maximum* workgroup size.
+unsigned
+AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
+                                                 const Function &F) const {
+  const unsigned WaveSize = getWavefrontSize();
+  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+  const unsigned WavesPerWorkgroup =
+      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);
+
+  const unsigned WorkGroupsPerCU =
+      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
+
+  return getLocalMemorySize() / WorkGroupsPerCU;
 }
 
 // FIXME: Should return min,max range.
+//
+// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
+// be achieved when only the given function is running on the machine; and
+// taking into account the overall number of wave slots, the (maximum) workgroup
+// size, and the per-workgroup LDS allocation size.
 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
   const Function &F) const {
   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
@@ -338,10 +354,13 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
 
   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
 
-  // Round to the number of waves.
-  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
+  // Round to the number of waves per CU.
+  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
 
+  // Number of waves per EU (SIMD).
+  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
+
   // Clamp to the maximum possible number of waves.
   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 99177ac07e2d1..972f996ad85aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -64,6 +64,7 @@ class AMDGPUSubtarget {
   unsigned EUsPerCU = 4;
   unsigned MaxWavesPerEU = 10;
   unsigned LocalMemorySize = 0;
+  unsigned AddressableLocalMemorySize = 0;
   char WavefrontSizeLog2 = 0;
 
 public:
@@ -210,6 +211,10 @@ class AMDGPUSubtarget {
     return LocalMemorySize;
   }
 
+  unsigned getAddressableLocalMemorySize() const {
+    return AddressableLocalMemorySize;
+  }
+
   /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
   /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
   /// CU mode into account.

diff  --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index afbd87ff97bb7..6946a05bc551a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -903,10 +903,12 @@ void GCNSchedStage::checkScheduling() {
     return;
   }
 
+  unsigned TargetOccupancy =
+      std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
   unsigned WavesAfter =
-      std::min(S.getTargetOccupancy(), PressureAfter.getOccupancy(ST));
+      std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
   unsigned WavesBefore =
-      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
+      std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
   LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                     << ", after " << WavesAfter << ".\n");
 

diff  --git a/llvm/lib/Target/AMDGPU/R600Subtarget.cpp b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp
index d8f0610549045..e5a8c5cf3baf6 100644
--- a/llvm/lib/Target/AMDGPU/R600Subtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp
@@ -28,7 +28,9 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
       InstrInfo(*this),
       FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
       TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
-      InstrItins(getInstrItineraryForCPU(GPU)) {}
+      InstrItins(getInstrItineraryForCPU(GPU)) {
+  AddressableLocalMemorySize = LocalMemorySize;
+}
 
 R600Subtarget &R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                                               StringRef GPU,

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3af17b6b5f1f5..e0ad11d5af24f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12635,7 +12635,8 @@ void SITargetLowering::computeKnownBitsForTargetInstr(
       // We can report everything over the maximum size as 0. We can't report
       // based on the actual size because we don't know if it's accurate or not
       // at any given point.
-      Known.Zero.setHighBits(countLeadingZeros(getSubtarget()->getLocalMemorySize()));
+      Known.Zero.setHighBits(
+          countLeadingZeros(getSubtarget()->getAddressableLocalMemorySize()));
       break;
     }
     }

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index cf0f598bc2d01..4263e3e9eeac4 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -828,11 +828,26 @@ unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
 }
 
 unsigned getLocalMemorySize(const MCSubtargetInfo *STI) {
+  unsigned BytesPerCU = 0;
+  if (STI->getFeatureBits().test(FeatureLocalMemorySize32768))
+    BytesPerCU = 32768;
+  if (STI->getFeatureBits().test(FeatureLocalMemorySize65536))
+    BytesPerCU = 65536;
+
+  // "Per CU" really means "per whatever functional block the waves of a
+  // workgroup must share". So the effective local memory size is doubled in
+  // WGP mode on gfx10.
+  if (isGFX10Plus(*STI) && !STI->getFeatureBits().test(FeatureCuMode))
+    BytesPerCU *= 2;
+
+  return BytesPerCU;
+}
+
+unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
   if (STI->getFeatureBits().test(FeatureLocalMemorySize32768))
     return 32768;
   if (STI->getFeatureBits().test(FeatureLocalMemorySize65536))
     return 65536;
-
   return 0;
 }
 
@@ -852,11 +867,18 @@ unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
   assert(FlatWorkGroupSize != 0);
   if (STI->getTargetTriple().getArch() != Triple::amdgcn)
     return 8;
+  unsigned MaxWaves = getMaxWavesPerEU(STI) * getEUsPerCU(STI);
   unsigned N = getWavesPerWorkGroup(STI, FlatWorkGroupSize);
-  if (N == 1)
-    return 40;
-  N = 40 / N;
-  return std::min(N, 16u);
+  if (N == 1) {
+    // Single-wave workgroups don't consume barrier resources.
+    return MaxWaves;
+  }
+
+  unsigned MaxBarriers = 16;
+  if (isGFX10Plus(*STI) && !STI->getFeatureBits().test(FeatureCuMode))
+    MaxBarriers = 32;
+
+  return std::min(MaxWaves / N, MaxBarriers);
 }
 
 unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index d527199ce12db..4d3423592353e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -192,6 +192,10 @@ unsigned getWavefrontSize(const MCSubtargetInfo *STI);
 /// \returns Local memory size in bytes for given subtarget \p STI.
 unsigned getLocalMemorySize(const MCSubtargetInfo *STI);
 
+/// \returns Maximum addressable local memory size in bytes for given subtarget
+/// \p STI.
+unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI);
+
 /// \returns Number of execution units per compute unit for given subtarget \p
 /// STI.
 unsigned getEUsPerCU(const MCSubtargetInfo *STI);

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
index e7350fec10546..0dcf986dacd60 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
@@ -492,29 +492,29 @@ define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 ; GFX8-NEXT:    flat_load_dwordx4 v[10:13], v[2:3]
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_ushort v14, v[0:1]
+; GFX8-NEXT:    flat_load_ushort v16, v[0:1]
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 16, v4
+; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, 0, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
 ; GFX8-NEXT:    v_add_u16_e32 v1, v6, v10
 ; GFX8-NEXT:    v_add_u16_sdwa v2, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e32 v3, v7, v11
-; GFX8-NEXT:    v_add_u16_sdwa v10, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_u16_e32 v11, v8, v12
+; GFX8-NEXT:    v_add_u16_sdwa v6, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_u16_e32 v7, v8, v12
 ; GFX8-NEXT:    v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_u16_e32 v12, v9, v13
+; GFX8-NEXT:    v_add_u16_e32 v10, v9, v13
 ; GFX8-NEXT:    v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 16, v4
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u16_e32 v13, v14, v0
+; GFX8-NEXT:    v_add_u16_e32 v11, v16, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v2
-; GFX8-NEXT:    v_or_b32_e32 v1, v3, v10
-; GFX8-NEXT:    v_or_b32_e32 v2, v11, v8
-; GFX8-NEXT:    v_or_b32_e32 v3, v12, v9
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
+; GFX8-NEXT:    v_or_b32_e32 v1, v3, v6
+; GFX8-NEXT:    v_or_b32_e32 v2, v7, v8
+; GFX8-NEXT:    v_or_b32_e32 v3, v10, v9
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT:    flat_store_short v[6:7], v13
+; GFX8-NEXT:    flat_store_short v[14:15], v11
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -640,55 +640,55 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
 ; GFX8-LABEL: add_v11i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 18, v0
+; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v18, v[10:11]
+; GFX8-NEXT:    flat_load_ushort v19, v[12:13]
+; GFX8-NEXT:    flat_load_ushort v20, v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx4 v[10:13], v[2:3]
-; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 16, v2
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 18, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 18, v2
-; GFX8-NEXT:    v_addc_u32_e32 v17, vcc, 0, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 20, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_load_ushort v14, v[14:15]
-; GFX8-NEXT:    flat_load_ushort v15, v[16:17]
-; GFX8-NEXT:    flat_load_ushort v16, v[2:3]
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    flat_load_ushort v1, v[14:15]
+; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 16, v4
+; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, 0, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 18, v4
+; GFX8-NEXT:    v_addc_u32_e32 v17, vcc, 0, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(3)
-; GFX8-NEXT:    v_add_u16_e32 v17, v6, v10
+; GFX8-NEXT:    v_add_u16_e32 v3, v6, v10
 ; GFX8-NEXT:    v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 18, v0
-; GFX8-NEXT:    v_add_u16_e32 v18, v7, v11
+; GFX8-NEXT:    v_add_u16_e32 v21, v7, v11
 ; GFX8-NEXT:    v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
-; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
-; GFX8-NEXT:    flat_load_ushort v3, v[6:7]
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_ushort v21, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 16, v4
+; GFX8-NEXT:    v_add_u16_e32 v22, v8, v12
+; GFX8-NEXT:    v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_u16_e32 v12, v9, v13
+; GFX8-NEXT:    v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 20, v4
 ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
-; GFX8-NEXT:    v_add_u16_e32 v19, v8, v12
-; GFX8-NEXT:    v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 18, v4
-; GFX8-NEXT:    v_add_u16_e32 v20, v9, v13
-; GFX8-NEXT:    v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, v17, v10
-; GFX8-NEXT:    v_or_b32_e32 v1, v18, v11
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 20, v4
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u16_e32 v14, v2, v14
+; GFX8-NEXT:    v_add_u16_e32 v13, v18, v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_add_u16_e32 v15, v3, v15
-; GFX8-NEXT:    v_or_b32_e32 v2, v19, v12
-; GFX8-NEXT:    v_or_b32_e32 v3, v20, v13
+; GFX8-NEXT:    v_add_u16_e32 v18, v19, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u16_e32 v16, v21, v16
+; GFX8-NEXT:    v_add_u16_e32 v19, v20, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v10
+; GFX8-NEXT:    v_or_b32_e32 v1, v21, v11
+; GFX8-NEXT:    v_or_b32_e32 v2, v22, v8
+; GFX8-NEXT:    v_or_b32_e32 v3, v12, v9
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT:    flat_store_short v[6:7], v14
-; GFX8-NEXT:    flat_store_short v[8:9], v15
-; GFX8-NEXT:    flat_store_short v[10:11], v16
+; GFX8-NEXT:    flat_store_short v[14:15], v13
+; GFX8-NEXT:    flat_store_short v[16:17], v18
+; GFX8-NEXT:    flat_store_short v[6:7], v19
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -771,34 +771,34 @@ define void @add_v12i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx4 v[10:13], v[2:3]
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 16, v2
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[14:15], v[2:3]
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_add_u16_e32 v2, v6, v10
-; GFX8-NEXT:    v_add_u16_sdwa v3, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_u16_e32 v10, v7, v11
-; GFX8-NEXT:    v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
-; GFX8-NEXT:    v_add_u16_e32 v16, v8, v12
-; GFX8-NEXT:    v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_u16_e32 v12, v9, v13
+; GFX8-NEXT:    flat_load_dwordx2 v[14:15], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[16:17], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_add_u16_e32 v0, v6, v10
+; GFX8-NEXT:    v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_u16_e32 v2, v7, v11
+; GFX8-NEXT:    v_add_u16_sdwa v3, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_u16_e32 v6, v8, v12
+; GFX8-NEXT:    v_add_u16_sdwa v7, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_u16_e32 v8, v9, v13
 ; GFX8-NEXT:    v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v10, v11
-; GFX8-NEXT:    v_or_b32_e32 v2, v16, v8
-; GFX8-NEXT:    v_or_b32_e32 v3, v12, v9
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX8-NEXT:    v_or_b32_e32 v2, v6, v7
+; GFX8-NEXT:    v_or_b32_e32 v3, v8, v9
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u16_e32 v6, v14, v16
+; GFX8-NEXT:    v_add_u16_sdwa v7, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_u16_e32 v8, v15, v17
+; GFX8-NEXT:    v_add_u16_sdwa v9, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_add_u16_e32 v8, v6, v14
-; GFX8-NEXT:    v_add_u16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_u16_e32 v9, v7, v15
-; GFX8-NEXT:    v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
-; GFX8-NEXT:    v_or_b32_e32 v6, v8, v6
-; GFX8-NEXT:    v_or_b32_e32 v7, v9, v7
+; GFX8-NEXT:    v_or_b32_e32 v7, v8, v9
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
index 5563c5fb41dc6..8b796bfde046c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -164,8 +164,6 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 ; GFX9-NEXT:    v_add_u32_e32 v16, 1, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 6, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 7, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v3, v5, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, v4, v6, s[4:5]
@@ -187,7 +185,6 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 ; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:32
 ; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:48
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 7, v16
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v9, vcc
@@ -200,78 +197,82 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v11, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v13, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v13, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v14, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v15, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v14, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v15, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v14, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v15, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[3:4]
-; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[7:10], v[7:8]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 1, v16
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 6, v16
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 7, v16
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v10, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v9, v11, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v3, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v4, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v11, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v12, v8, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v2, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v3, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v10, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
 ; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 7, v17
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v18, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v19, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v9, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v17
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v17
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v17
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[6:7]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v13, s[6:7]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v14, s[8:9]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v15, s[8:9]
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v14, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v14, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v15, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
@@ -286,7 +287,6 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 ; GFX7-NEXT:    v_add_i32_e32 v16, vcc, 1, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 6, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_cndmask_b32_e64 v11, v3, v5, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v12, v4, v6, s[4:5]
@@ -308,8 +308,6 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 ; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32
 ; GFX7-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:48
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 7, v16
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 7, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v6, v9, vcc
@@ -322,16 +320,19 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v11, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[6:7]
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v13, s[6:7]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v13, vcc
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v14, s[8:9]
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v15, s[8:9]
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v3, v14, s[4:5]
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v4, v15, s[4:5]
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v14, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v15, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
@@ -339,37 +340,37 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:32
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v3
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v8, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v9, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, v8, v10, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v12, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v13, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v12, v14, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v13, v15, s4
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:48
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v9, v11, s4
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:32
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 2, v3
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v14, v4, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v15, v5, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v18, v4, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v19, v5, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 3, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v12, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v13, v7, vcc_lo
-; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:48
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 4, v3
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s4
@@ -383,8 +384,8 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 7, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index e5c1d645d2cb9..18a82e8bc7a58 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -320,9 +320,9 @@ define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT:    s_mov_b64 s[12:13], 5
 ; GCN-NEXT:    v_mov_b32_e32 v7, s10
 ; GCN-NEXT:    v_mov_b32_e32 v8, s11
-; GCN-NEXT:    s_mov_b64 s[12:13], 5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
index 1d1d4d503f69b..d49007f31e6bb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
@@ -733,33 +733,33 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
-; GFX6-NEXT:    v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3]
+; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; GFX6-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1]
-; GFX6-NEXT:    v_rcp_f64_e32 v[16:17], v[14:15]
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v17
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v19
 ; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX6-NEXT:    v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3]
+; GFX6-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; GFX6-NEXT:    v_rcp_f64_e32 v[18:19], v[12:13]
+; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX6-NEXT:    v_mul_f64 v[14:15], v[16:17], v[10:11]
+; GFX6-NEXT:    v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0
+; GFX6-NEXT:    v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17]
+; GFX6-NEXT:    v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19]
+; GFX6-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
+; GFX6-NEXT:    v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v9
-; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX6-NEXT:    v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19]
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX6-NEXT:    v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v15
-; GFX6-NEXT:    v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17]
-; GFX6-NEXT:    v_mul_f64 v[16:17], v[18:19], v[10:11]
-; GFX6-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19]
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0
-; GFX6-NEXT:    v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17]
-; GFX6-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
-; GFX6-NEXT:    v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3]
-; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1]
-; GFX6-NEXT:    v_mul_f64 v[16:17], v[12:13], v[8:9]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v13
-; GFX6-NEXT:    v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13]
+; GFX6-NEXT:    v_mul_f64 v[16:17], v[20:21], v[8:9]
+; GFX6-NEXT:    v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21]
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v21
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v13
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    s_nop 1
-; GFX6-NEXT:    v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17]
+; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1]
+; GFX6-NEXT:    s_nop 0
+; GFX6-NEXT:    v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -768,26 +768,26 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX8-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
+; GFX8-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
 ; GFX8-NEXT:    v_rcp_f64_e32 v[12:13], v[8:9]
 ; GFX8-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
 ; GFX8-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX8-NEXT:    v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1]
 ; GFX8-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
-; GFX8-NEXT:    v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1]
-; GFX8-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
-; GFX8-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX8-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
-; GFX8-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
-; GFX8-NEXT:    v_mul_f64 v[16:17], v[18:19], v[12:13]
-; GFX8-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19]
-; GFX8-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3]
-; GFX8-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17]
+; GFX8-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0
+; GFX8-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0
+; GFX8-NEXT:    v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; GFX8-NEXT:    v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15]
+; GFX8-NEXT:    v_mul_f64 v[18:19], v[16:17], v[12:13]
+; GFX8-NEXT:    v_mul_f64 v[22:23], v[20:21], v[14:15]
+; GFX8-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
+; GFX8-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21]
+; GFX8-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
 ; GFX8-NEXT:    s_mov_b64 vcc, s[4:5]
-; GFX8-NEXT:    v_mul_f64 v[20:21], v[18:19], v[14:15]
+; GFX8-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
-; GFX8-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19]
-; GFX8-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21]
 ; GFX8-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -796,26 +796,26 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX9-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
+; GFX9-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
 ; GFX9-NEXT:    v_rcp_f64_e32 v[12:13], v[8:9]
 ; GFX9-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
 ; GFX9-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX9-NEXT:    v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1]
 ; GFX9-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
-; GFX9-NEXT:    v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1]
-; GFX9-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
-; GFX9-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX9-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
-; GFX9-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
-; GFX9-NEXT:    v_mul_f64 v[16:17], v[18:19], v[12:13]
-; GFX9-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19]
-; GFX9-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3]
-; GFX9-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17]
+; GFX9-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0
+; GFX9-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0
+; GFX9-NEXT:    v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; GFX9-NEXT:    v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15]
+; GFX9-NEXT:    v_mul_f64 v[18:19], v[16:17], v[12:13]
+; GFX9-NEXT:    v_mul_f64 v[22:23], v[20:21], v[14:15]
+; GFX9-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
+; GFX9-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21]
+; GFX9-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
 ; GFX9-NEXT:    s_mov_b64 vcc, s[4:5]
-; GFX9-NEXT:    v_mul_f64 v[20:21], v[18:19], v[14:15]
+; GFX9-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
-; GFX9-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19]
-; GFX9-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21]
 ; GFX9-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -970,33 +970,33 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
-; GFX6-NEXT:    v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3]
+; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; GFX6-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1]
-; GFX6-NEXT:    v_rcp_f64_e32 v[16:17], v[14:15]
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v17
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v19
 ; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX6-NEXT:    v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3]
+; GFX6-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; GFX6-NEXT:    v_rcp_f64_e32 v[18:19], v[12:13]
+; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX6-NEXT:    v_mul_f64 v[14:15], v[16:17], v[10:11]
+; GFX6-NEXT:    v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0
+; GFX6-NEXT:    v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17]
+; GFX6-NEXT:    v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19]
+; GFX6-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
+; GFX6-NEXT:    v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v9
-; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX6-NEXT:    v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19]
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX6-NEXT:    v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v15
-; GFX6-NEXT:    v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17]
-; GFX6-NEXT:    v_mul_f64 v[16:17], v[18:19], v[10:11]
-; GFX6-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19]
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0
-; GFX6-NEXT:    v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17]
-; GFX6-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
-; GFX6-NEXT:    v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3]
-; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1]
-; GFX6-NEXT:    v_mul_f64 v[16:17], v[12:13], v[8:9]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v13
-; GFX6-NEXT:    v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13]
+; GFX6-NEXT:    v_mul_f64 v[16:17], v[20:21], v[8:9]
+; GFX6-NEXT:    v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21]
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v21
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v13
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    s_nop 1
-; GFX6-NEXT:    v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17]
+; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1]
+; GFX6-NEXT:    s_nop 0
+; GFX6-NEXT:    v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1005,26 +1005,26 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX8-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
+; GFX8-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
 ; GFX8-NEXT:    v_rcp_f64_e32 v[12:13], v[8:9]
 ; GFX8-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
 ; GFX8-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX8-NEXT:    v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1]
 ; GFX8-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
-; GFX8-NEXT:    v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1]
-; GFX8-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
-; GFX8-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX8-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
-; GFX8-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
-; GFX8-NEXT:    v_mul_f64 v[16:17], v[18:19], v[12:13]
-; GFX8-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19]
-; GFX8-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3]
-; GFX8-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17]
+; GFX8-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0
+; GFX8-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0
+; GFX8-NEXT:    v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; GFX8-NEXT:    v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15]
+; GFX8-NEXT:    v_mul_f64 v[18:19], v[16:17], v[12:13]
+; GFX8-NEXT:    v_mul_f64 v[22:23], v[20:21], v[14:15]
+; GFX8-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
+; GFX8-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21]
+; GFX8-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
 ; GFX8-NEXT:    s_mov_b64 vcc, s[4:5]
-; GFX8-NEXT:    v_mul_f64 v[20:21], v[18:19], v[14:15]
+; GFX8-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
-; GFX8-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19]
-; GFX8-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21]
 ; GFX8-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1033,26 +1033,26 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX9-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
+; GFX9-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
 ; GFX9-NEXT:    v_rcp_f64_e32 v[12:13], v[8:9]
 ; GFX9-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
 ; GFX9-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX9-NEXT:    v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1]
 ; GFX9-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
-; GFX9-NEXT:    v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1]
-; GFX9-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
-; GFX9-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX9-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
-; GFX9-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
-; GFX9-NEXT:    v_mul_f64 v[16:17], v[18:19], v[12:13]
-; GFX9-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19]
-; GFX9-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3]
-; GFX9-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17]
+; GFX9-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0
+; GFX9-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0
+; GFX9-NEXT:    v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; GFX9-NEXT:    v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15]
+; GFX9-NEXT:    v_mul_f64 v[18:19], v[16:17], v[12:13]
+; GFX9-NEXT:    v_mul_f64 v[22:23], v[20:21], v[14:15]
+; GFX9-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
+; GFX9-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21]
+; GFX9-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
 ; GFX9-NEXT:    s_mov_b64 vcc, s[4:5]
-; GFX9-NEXT:    v_mul_f64 v[20:21], v[18:19], v[14:15]
+; GFX9-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
-; GFX9-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19]
-; GFX9-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21]
 ; GFX9-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1133,7 +1133,7 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX6-NEXT:    v_mov_b32_e32 v18, 0x3ff00000
+; GFX6-NEXT:    v_mov_b32_e32 v20, 0x3ff00000
 ; GFX6-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
 ; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
@@ -1142,23 +1142,23 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
 ; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v9
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v9
 ; GFX6-NEXT:    v_mul_f64 v[12:13], v[8:9], v[6:7]
+; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
+; GFX6-NEXT:    v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9]
+; GFX6-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9]
-; GFX6-NEXT:    v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0
-; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0
-; GFX6-NEXT:    v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15]
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_mul_f64 v[14:15], v[16:17], v[4:5]
-; GFX6-NEXT:    v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13]
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v17
+; GFX6-NEXT:    v_mul_f64 v[8:9], v[16:17], v[4:5]
+; GFX6-NEXT:    v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13]
+; GFX6-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17]
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v17
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v11
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
 ; GFX6-NEXT:    s_nop 0
-; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15]
+; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1295,7 +1295,7 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX6-NEXT:    v_mov_b32_e32 v18, 0x3ff00000
+; GFX6-NEXT:    v_mov_b32_e32 v20, 0x3ff00000
 ; GFX6-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
 ; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
@@ -1304,23 +1304,23 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
 ; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v9
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v9
 ; GFX6-NEXT:    v_mul_f64 v[12:13], v[8:9], v[6:7]
+; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
+; GFX6-NEXT:    v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9]
+; GFX6-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9]
-; GFX6-NEXT:    v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0
-; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0
-; GFX6-NEXT:    v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15]
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_mul_f64 v[14:15], v[16:17], v[4:5]
-; GFX6-NEXT:    v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13]
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v17
+; GFX6-NEXT:    v_mul_f64 v[8:9], v[16:17], v[4:5]
+; GFX6-NEXT:    v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13]
+; GFX6-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17]
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v17
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v11
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
 ; GFX6-NEXT:    s_nop 0
-; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15]
+; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1533,7 +1533,7 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX6-NEXT:    v_mov_b32_e32 v18, 0x3ff00000
+; GFX6-NEXT:    v_mov_b32_e32 v20, 0x3ff00000
 ; GFX6-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
 ; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
@@ -1542,23 +1542,23 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
 ; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v9
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v9
 ; GFX6-NEXT:    v_mul_f64 v[12:13], v[8:9], v[6:7]
+; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
+; GFX6-NEXT:    v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9]
+; GFX6-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9]
-; GFX6-NEXT:    v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0
-; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0
-; GFX6-NEXT:    v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15]
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_mul_f64 v[14:15], v[16:17], v[4:5]
-; GFX6-NEXT:    v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13]
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v17
+; GFX6-NEXT:    v_mul_f64 v[8:9], v[16:17], v[4:5]
+; GFX6-NEXT:    v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13]
+; GFX6-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17]
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v17
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v11
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
 ; GFX6-NEXT:    s_nop 0
-; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15]
+; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1769,33 +1769,33 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
-; GFX6-NEXT:    v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3]
+; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; GFX6-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1]
-; GFX6-NEXT:    v_rcp_f64_e32 v[16:17], v[14:15]
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v17
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v19
 ; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX6-NEXT:    v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3]
+; GFX6-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; GFX6-NEXT:    v_rcp_f64_e32 v[18:19], v[12:13]
+; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX6-NEXT:    v_mul_f64 v[14:15], v[16:17], v[10:11]
+; GFX6-NEXT:    v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0
+; GFX6-NEXT:    v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17]
+; GFX6-NEXT:    v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19]
+; GFX6-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
+; GFX6-NEXT:    v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v9
-; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX6-NEXT:    v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19]
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX6-NEXT:    v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v15
-; GFX6-NEXT:    v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17]
-; GFX6-NEXT:    v_mul_f64 v[16:17], v[18:19], v[10:11]
-; GFX6-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19]
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0
-; GFX6-NEXT:    v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17]
-; GFX6-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
-; GFX6-NEXT:    v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3]
-; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1]
-; GFX6-NEXT:    v_mul_f64 v[16:17], v[12:13], v[8:9]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v13
-; GFX6-NEXT:    v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13]
+; GFX6-NEXT:    v_mul_f64 v[16:17], v[20:21], v[8:9]
+; GFX6-NEXT:    v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21]
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v21
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v13
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    s_nop 1
-; GFX6-NEXT:    v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17]
+; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1]
+; GFX6-NEXT:    s_nop 0
+; GFX6-NEXT:    v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1804,26 +1804,26 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX8-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
+; GFX8-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
 ; GFX8-NEXT:    v_rcp_f64_e32 v[12:13], v[8:9]
 ; GFX8-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
 ; GFX8-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX8-NEXT:    v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1]
 ; GFX8-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
-; GFX8-NEXT:    v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1]
-; GFX8-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
-; GFX8-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX8-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
-; GFX8-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
-; GFX8-NEXT:    v_mul_f64 v[16:17], v[18:19], v[12:13]
-; GFX8-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19]
-; GFX8-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3]
-; GFX8-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17]
+; GFX8-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0
+; GFX8-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0
+; GFX8-NEXT:    v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; GFX8-NEXT:    v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15]
+; GFX8-NEXT:    v_mul_f64 v[18:19], v[16:17], v[12:13]
+; GFX8-NEXT:    v_mul_f64 v[22:23], v[20:21], v[14:15]
+; GFX8-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
+; GFX8-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21]
+; GFX8-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
 ; GFX8-NEXT:    s_mov_b64 vcc, s[4:5]
-; GFX8-NEXT:    v_mul_f64 v[20:21], v[18:19], v[14:15]
+; GFX8-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
-; GFX8-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19]
-; GFX8-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21]
 ; GFX8-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1832,26 +1832,26 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX9-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
+; GFX9-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
 ; GFX9-NEXT:    v_rcp_f64_e32 v[12:13], v[8:9]
 ; GFX9-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
 ; GFX9-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX9-NEXT:    v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1]
 ; GFX9-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
-; GFX9-NEXT:    v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1]
-; GFX9-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
-; GFX9-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX9-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
-; GFX9-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
-; GFX9-NEXT:    v_mul_f64 v[16:17], v[18:19], v[12:13]
-; GFX9-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19]
-; GFX9-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3]
-; GFX9-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17]
+; GFX9-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0
+; GFX9-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0
+; GFX9-NEXT:    v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; GFX9-NEXT:    v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15]
+; GFX9-NEXT:    v_mul_f64 v[18:19], v[16:17], v[12:13]
+; GFX9-NEXT:    v_mul_f64 v[22:23], v[20:21], v[14:15]
+; GFX9-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
+; GFX9-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21]
+; GFX9-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
 ; GFX9-NEXT:    s_mov_b64 vcc, s[4:5]
-; GFX9-NEXT:    v_mul_f64 v[20:21], v[18:19], v[14:15]
+; GFX9-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
-; GFX9-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19]
-; GFX9-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21]
 ; GFX9-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 78a5bc7c34f23..6171b7e19927a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -7899,90 +7899,90 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v23, 0x7f, v16
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, 64, v23
-; GFX6-NEXT:    v_lshr_b64 v[17:18], v[0:1], v17
-; GFX6-NEXT:    v_lshl_b64 v[21:22], v[2:3], v23
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[8:9], 1
 ; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v16
-; GFX6-NEXT:    v_or_b32_e32 v21, v17, v21
-; GFX6-NEXT:    v_lshlrev_b32_e32 v17, 31, v10
 ; GFX6-NEXT:    v_and_b32_e32 v24, 0x7f, v16
-; GFX6-NEXT:    v_lshr_b64 v[10:11], v[10:11], 1
-; GFX6-NEXT:    v_or_b32_e32 v9, v9, v17
-; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 64, v24
-; GFX6-NEXT:    v_or_b32_e32 v22, v18, v22
-; GFX6-NEXT:    v_lshl_b64 v[16:17], v[10:11], v16
-; GFX6-NEXT:    v_lshr_b64 v[18:19], v[8:9], v24
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX6-NEXT:    v_or_b32_e32 v18, v18, v16
-; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, 64, v23
-; GFX6-NEXT:    v_or_b32_e32 v19, v19, v17
-; GFX6-NEXT:    v_lshl_b64 v[16:17], v[0:1], v16
-; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v23
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 64, v23
+; GFX6-NEXT:    v_subrev_i32_e32 v25, vcc, 64, v23
+; GFX6-NEXT:    v_lshr_b64 v[16:17], v[0:1], v16
+; GFX6-NEXT:    v_lshl_b64 v[18:19], v[2:3], v23
+; GFX6-NEXT:    v_lshl_b64 v[21:22], v[0:1], v23
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v25
+; GFX6-NEXT:    v_or_b32_e32 v16, v16, v18
+; GFX6-NEXT:    v_or_b32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX6-NEXT:    v_cndmask_b32_e32 v25, 0, v0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v16, v21, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v16, v17, v22, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v17, v0, v2, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e64 v16, v16, v3, s[4:5]
-; GFX6-NEXT:    v_subrev_i32_e64 v0, s[4:5], 64, v24
-; GFX6-NEXT:    v_lshr_b64 v[2:3], v[10:11], v0
-; GFX6-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v24
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v18, 0, v1, vcc
-; GFX6-NEXT:    v_lshr_b64 v[0:1], v[10:11], v24
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[4:5]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
+; GFX6-NEXT:    v_cndmask_b32_e32 v18, 0, v21, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v19, 0, v22, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v23
+; GFX6-NEXT:    v_cndmask_b32_e32 v21, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v22, v1, v3, vcc
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[8:9], 1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 31, v10
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[10:11], 1
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, 64, v24
+; GFX6-NEXT:    v_subrev_i32_e32 v23, vcc, 64, v24
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[0:1], v24
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[2:3], v10
+; GFX6-NEXT:    v_lshr_b64 v[16:17], v[2:3], v24
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], v23
+; GFX6-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
+; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v24
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, v1, s[4:5]
-; GFX6-NEXT:    v_or_b32_e32 v0, v25, v2
-; GFX6-NEXT:    v_or_b32_e32 v1, v18, v3
-; GFX6-NEXT:    v_or_b32_e32 v2, v17, v8
-; GFX6-NEXT:    v_or_b32_e32 v3, v16, v9
-; GFX6-NEXT:    v_and_b32_e32 v16, 0x7f, v20
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX6-NEXT:    v_or_b32_e32 v0, v18, v0
+; GFX6-NEXT:    v_and_b32_e32 v18, 0x7f, v20
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v20
-; GFX6-NEXT:    v_and_b32_e32 v17, 0x7f, v8
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v16
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
+; GFX6-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX6-NEXT:    v_and_b32_e32 v19, 0x7f, v8
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v18
+; GFX6-NEXT:    v_subrev_i32_e32 v20, vcc, 64, v18
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[4:5], v8
-; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v16
-; GFX6-NEXT:    v_subrev_i32_e32 v18, vcc, 64, v16
-; GFX6-NEXT:    v_or_b32_e32 v10, v8, v10
-; GFX6-NEXT:    v_or_b32_e32 v11, v9, v11
-; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], v16
-; GFX6-NEXT:    v_lshl_b64 v[4:5], v[4:5], v18
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX6-NEXT:    v_cndmask_b32_e32 v18, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v19, 0, v9, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e32 v16, v4, v6, vcc
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v18
+; GFX6-NEXT:    v_lshl_b64 v[16:17], v[4:5], v18
+; GFX6-NEXT:    v_lshl_b64 v[4:5], v[4:5], v20
+; GFX6-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
+; GFX6-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
+; GFX6-NEXT:    v_cndmask_b32_e32 v18, v4, v6, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v20, v5, v7, vcc
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[12:13], 1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 31, v14
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX6-NEXT:    v_lshr_b64 v[6:7], v[14:15], 1
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, 64, v17
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[4:5], v17
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, 64, v19
+; GFX6-NEXT:    v_subrev_i32_e32 v14, vcc, 64, v19
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[4:5], v19
 ; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v10
-; GFX6-NEXT:    v_subrev_i32_e32 v12, vcc, 64, v17
-; GFX6-NEXT:    v_or_b32_e32 v10, v8, v10
-; GFX6-NEXT:    v_or_b32_e32 v11, v9, v11
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[6:7], v17
-; GFX6-NEXT:    v_lshr_b64 v[6:7], v[6:7], v12
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GFX6-NEXT:    v_lshr_b64 v[12:13], v[6:7], v19
+; GFX6-NEXT:    v_lshr_b64 v[6:7], v[6:7], v14
+; GFX6-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX6-NEXT:    v_or_b32_e32 v4, v18, v4
-; GFX6-NEXT:    v_or_b32_e32 v5, v19, v5
-; GFX6-NEXT:    v_or_b32_e32 v6, v16, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v12, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v13, vcc
+; GFX6-NEXT:    v_or_b32_e32 v2, v21, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v22, v3
+; GFX6-NEXT:    v_or_b32_e32 v4, v16, v4
+; GFX6-NEXT:    v_or_b32_e32 v5, v17, v5
+; GFX6-NEXT:    v_or_b32_e32 v6, v18, v6
 ; GFX6-NEXT:    v_or_b32_e32 v7, v20, v7
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7990,90 +7990,90 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0x7f, v16
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, 64, v23
-; GFX8-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[21:22], v23, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], 1, v[8:9]
 ; GFX8-NEXT:    v_xor_b32_e32 v16, -1, v16
-; GFX8-NEXT:    v_or_b32_e32 v21, v17, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 31, v10
 ; GFX8-NEXT:    v_and_b32_e32 v24, 0x7f, v16
-; GFX8-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
-; GFX8-NEXT:    v_or_b32_e32 v9, v9, v17
-; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, 64, v24
-; GFX8-NEXT:    v_or_b32_e32 v22, v18, v22
-; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v16, v[10:11]
-; GFX8-NEXT:    v_lshrrev_b64 v[18:19], v24, v[8:9]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX8-NEXT:    v_or_b32_e32 v18, v18, v16
-; GFX8-NEXT:    v_subrev_u32_e32 v16, vcc, 64, v23
-; GFX8-NEXT:    v_or_b32_e32 v19, v19, v17
-; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v16, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v23, v[0:1]
+; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, 64, v23
+; GFX8-NEXT:    v_subrev_u32_e32 v25, vcc, 64, v23
+; GFX8-NEXT:    v_lshrrev_b64 v[16:17], v16, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[18:19], v23, v[2:3]
+; GFX8-NEXT:    v_lshlrev_b64 v[21:22], v23, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v25, v[0:1]
+; GFX8-NEXT:    v_or_b32_e32 v16, v16, v18
+; GFX8-NEXT:    v_or_b32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v25, 0, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v21, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v22, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v17, v0, v2, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v16, v16, v3, s[4:5]
-; GFX8-NEXT:    v_subrev_u32_e64 v0, s[4:5], 64, v24
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v0, v[10:11]
-; GFX8-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v24
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, 0, v1, vcc
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[4:5]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, 0, v21, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, 0, v22, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v1, v3, vcc
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 1, v[8:9]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 31, v10
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[10:11]
+; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, 64, v24
+; GFX8-NEXT:    v_subrev_u32_e32 v23, vcc, 64, v24
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v24, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
+; GFX8-NEXT:    v_lshrrev_b64 v[16:17], v24, v[2:3]
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v23, v[2:3]
+; GFX8-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
+; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v24
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, v1, s[4:5]
-; GFX8-NEXT:    v_or_b32_e32 v0, v25, v2
-; GFX8-NEXT:    v_or_b32_e32 v1, v18, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, v17, v8
-; GFX8-NEXT:    v_or_b32_e32 v3, v16, v9
-; GFX8-NEXT:    v_and_b32_e32 v16, 0x7f, v20
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX8-NEXT:    v_or_b32_e32 v0, v18, v0
+; GFX8-NEXT:    v_and_b32_e32 v18, 0x7f, v20
 ; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v20
-; GFX8-NEXT:    v_and_b32_e32 v17, 0x7f, v8
-; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
+; GFX8-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX8-NEXT:    v_and_b32_e32 v19, 0x7f, v8
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v18
+; GFX8-NEXT:    v_subrev_u32_e32 v20, vcc, 64, v18
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v8, v[4:5]
-; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v16, v[6:7]
-; GFX8-NEXT:    v_subrev_u32_e32 v18, vcc, 64, v16
-; GFX8-NEXT:    v_or_b32_e32 v10, v8, v10
-; GFX8-NEXT:    v_or_b32_e32 v11, v9, v11
-; GFX8-NEXT:    v_lshlrev_b64 v[8:9], v16, v[4:5]
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v18, v[4:5]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, 0, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v4, v6, vcc
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v18, v[6:7]
+; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v18, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v20, v[4:5]
+; GFX8-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v4, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v20, v5, v7, vcc
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], 1, v[12:13]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 31, v14
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_lshrrev_b64 v[6:7], 1, v[14:15]
-; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, 64, v17
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v17, v[4:5]
+; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, 64, v19
+; GFX8-NEXT:    v_subrev_u32_e32 v14, vcc, 64, v19
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v19, v[4:5]
 ; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v10, v[6:7]
-; GFX8-NEXT:    v_subrev_u32_e32 v12, vcc, 64, v17
-; GFX8-NEXT:    v_or_b32_e32 v10, v8, v10
-; GFX8-NEXT:    v_or_b32_e32 v11, v9, v11
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v17, v[6:7]
-; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v12, v[6:7]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GFX8-NEXT:    v_lshrrev_b64 v[12:13], v19, v[6:7]
+; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v14, v[6:7]
+; GFX8-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX8-NEXT:    v_or_b32_e32 v4, v18, v4
-; GFX8-NEXT:    v_or_b32_e32 v5, v19, v5
-; GFX8-NEXT:    v_or_b32_e32 v6, v16, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v13, vcc
+; GFX8-NEXT:    v_or_b32_e32 v2, v21, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v22, v3
+; GFX8-NEXT:    v_or_b32_e32 v4, v16, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v17, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, v18, v6
 ; GFX8-NEXT:    v_or_b32_e32 v7, v20, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8081,90 +8081,90 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v23, 0x7f, v16
-; GFX9-NEXT:    v_sub_u32_e32 v17, 64, v23
-; GFX9-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
-; GFX9-NEXT:    v_lshlrev_b64 v[21:22], v23, v[2:3]
-; GFX9-NEXT:    v_lshrrev_b64 v[8:9], 1, v[8:9]
 ; GFX9-NEXT:    v_xor_b32_e32 v16, -1, v16
-; GFX9-NEXT:    v_or_b32_e32 v21, v17, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 31, v10
 ; GFX9-NEXT:    v_and_b32_e32 v24, 0x7f, v16
-; GFX9-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
-; GFX9-NEXT:    v_or_b32_e32 v9, v9, v17
-; GFX9-NEXT:    v_sub_u32_e32 v16, 64, v24
-; GFX9-NEXT:    v_or_b32_e32 v22, v18, v22
-; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v16, v[10:11]
-; GFX9-NEXT:    v_lshrrev_b64 v[18:19], v24, v[8:9]
+; GFX9-NEXT:    v_sub_u32_e32 v16, 64, v23
+; GFX9-NEXT:    v_subrev_u32_e32 v25, 64, v23
+; GFX9-NEXT:    v_lshrrev_b64 v[16:17], v16, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[18:19], v23, v[2:3]
+; GFX9-NEXT:    v_lshlrev_b64 v[21:22], v23, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v25, v[0:1]
+; GFX9-NEXT:    v_or_b32_e32 v16, v16, v18
+; GFX9-NEXT:    v_or_b32_e32 v17, v17, v19
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX9-NEXT:    v_or_b32_e32 v18, v18, v16
-; GFX9-NEXT:    v_subrev_u32_e32 v16, 64, v23
-; GFX9-NEXT:    v_or_b32_e32 v19, v19, v17
-; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v16, v[0:1]
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v23, v[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX9-NEXT:    v_cndmask_b32_e32 v25, 0, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v21, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v22, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v17, v0, v2, s[4:5]
-; GFX9-NEXT:    v_subrev_u32_e32 v0, 64, v24
-; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v3, s[4:5]
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v0, v[10:11]
-; GFX9-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v24
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, 0, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[4:5]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, 0, v21, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, 0, v22, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v23
+; GFX9-NEXT:    v_cndmask_b32_e32 v21, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, v1, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 1, v[8:9]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 31, v10
+; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 1, v[10:11]
+; GFX9-NEXT:    v_sub_u32_e32 v10, 64, v24
+; GFX9-NEXT:    v_subrev_u32_e32 v23, 64, v24
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v24, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[16:17], v24, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v23, v[2:3]
+; GFX9-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
+; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v24
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, v1, s[4:5]
-; GFX9-NEXT:    v_or_b32_e32 v0, v25, v2
-; GFX9-NEXT:    v_or_b32_e32 v1, v18, v3
-; GFX9-NEXT:    v_or_b32_e32 v2, v17, v8
-; GFX9-NEXT:    v_or_b32_e32 v3, v16, v9
-; GFX9-NEXT:    v_and_b32_e32 v16, 0x7f, v20
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX9-NEXT:    v_or_b32_e32 v0, v18, v0
+; GFX9-NEXT:    v_and_b32_e32 v18, 0x7f, v20
 ; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v20
-; GFX9-NEXT:    v_and_b32_e32 v17, 0x7f, v8
-; GFX9-NEXT:    v_sub_u32_e32 v8, 64, v16
+; GFX9-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX9-NEXT:    v_and_b32_e32 v19, 0x7f, v8
+; GFX9-NEXT:    v_sub_u32_e32 v8, 64, v18
+; GFX9-NEXT:    v_subrev_u32_e32 v20, 64, v18
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v8, v[4:5]
-; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v16, v[6:7]
-; GFX9-NEXT:    v_subrev_u32_e32 v18, 64, v16
-; GFX9-NEXT:    v_or_b32_e32 v10, v8, v10
-; GFX9-NEXT:    v_or_b32_e32 v11, v9, v11
-; GFX9-NEXT:    v_lshlrev_b64 v[8:9], v16, v[4:5]
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v18, v[4:5]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v19, 0, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v4, v6, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v18, v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v18, v[4:5]
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v20, v[4:5]
+; GFX9-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v4, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v20, v5, v7, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], 1, v[12:13]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 31, v14
 ; GFX9-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX9-NEXT:    v_lshrrev_b64 v[6:7], 1, v[14:15]
-; GFX9-NEXT:    v_sub_u32_e32 v10, 64, v17
-; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v17, v[4:5]
+; GFX9-NEXT:    v_sub_u32_e32 v10, 64, v19
+; GFX9-NEXT:    v_subrev_u32_e32 v14, 64, v19
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v19, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v10, v[6:7]
-; GFX9-NEXT:    v_subrev_u32_e32 v12, 64, v17
-; GFX9-NEXT:    v_or_b32_e32 v10, v8, v10
-; GFX9-NEXT:    v_or_b32_e32 v11, v9, v11
-; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v17, v[6:7]
-; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v12, v[6:7]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GFX9-NEXT:    v_lshrrev_b64 v[12:13], v19, v[6:7]
+; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v14, v[6:7]
+; GFX9-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX9-NEXT:    v_or_b32_e32 v4, v18, v4
-; GFX9-NEXT:    v_or_b32_e32 v5, v19, v5
-; GFX9-NEXT:    v_or_b32_e32 v6, v16, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v13, vcc
+; GFX9-NEXT:    v_or_b32_e32 v2, v21, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v22, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, v16, v4
+; GFX9-NEXT:    v_or_b32_e32 v5, v17, v5
+; GFX9-NEXT:    v_or_b32_e32 v6, v18, v6
 ; GFX9-NEXT:    v_or_b32_e32 v7, v20, v7
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 374455a65bdcf..f81242b57d320 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -8041,274 +8041,274 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-LABEL: v_fshr_v2i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_xor_b32_e32 v17, -1, v16
+; GFX6-NEXT:    v_and_b32_e32 v23, 0x7f, v16
+; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v16
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT:    v_and_b32_e32 v23, 0x7f, v17
-; GFX6-NEXT:    v_lshrrev_b32_e32 v17, 31, v1
-; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
-; GFX6-NEXT:    v_or_b32_e32 v2, v2, v17
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, 64, v23
-; GFX6-NEXT:    v_lshr_b64 v[17:18], v[0:1], v17
-; GFX6-NEXT:    v_lshl_b64 v[21:22], v[2:3], v23
 ; GFX6-NEXT:    v_and_b32_e32 v24, 0x7f, v16
-; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 64, v24
-; GFX6-NEXT:    v_or_b32_e32 v21, v17, v21
-; GFX6-NEXT:    v_or_b32_e32 v22, v18, v22
-; GFX6-NEXT:    v_lshl_b64 v[16:17], v[10:11], v16
-; GFX6-NEXT:    v_lshr_b64 v[18:19], v[8:9], v24
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX6-NEXT:    v_or_b32_e32 v18, v18, v16
-; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, 64, v23
-; GFX6-NEXT:    v_or_b32_e32 v19, v19, v17
-; GFX6-NEXT:    v_lshl_b64 v[16:17], v[0:1], v16
-; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v23
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX6-NEXT:    v_cndmask_b32_e32 v25, 0, v0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v16, v21, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v16, v17, v22, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v17, v0, v2, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e64 v16, v16, v3, s[4:5]
-; GFX6-NEXT:    v_subrev_i32_e64 v0, s[4:5], 64, v24
-; GFX6-NEXT:    v_lshr_b64 v[2:3], v[10:11], v0
-; GFX6-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v24
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v18, 0, v1, vcc
-; GFX6-NEXT:    v_lshr_b64 v[0:1], v[10:11], v24
+; GFX6-NEXT:    v_lshl_b64 v[16:17], v[0:1], 1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v0
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v24
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[16:17], v0
+; GFX6-NEXT:    v_lshl_b64 v[18:19], v[2:3], v24
+; GFX6-NEXT:    v_subrev_i32_e32 v25, vcc, 64, v24
+; GFX6-NEXT:    v_lshl_b64 v[21:22], v[16:17], v24
+; GFX6-NEXT:    v_or_b32_e32 v18, v0, v18
+; GFX6-NEXT:    v_or_b32_e32 v19, v1, v19
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[16:17], v25
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
+; GFX6-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v22, 0, v22, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v18, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v19, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, v1, s[4:5]
-; GFX6-NEXT:    v_or_b32_e32 v0, v25, v2
-; GFX6-NEXT:    v_or_b32_e32 v2, v17, v8
+; GFX6-NEXT:    v_cndmask_b32_e32 v18, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v19, v1, v3, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v23
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[8:9], v23
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[10:11], v2
+; GFX6-NEXT:    v_subrev_i32_e32 v24, vcc, 64, v23
+; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[10:11], v24
+; GFX6-NEXT:    v_lshr_b64 v[16:17], v[10:11], v23
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v20
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
-; GFX6-NEXT:    v_or_b32_e32 v1, v18, v3
-; GFX6-NEXT:    v_or_b32_e32 v3, v16, v9
-; GFX6-NEXT:    v_and_b32_e32 v17, 0x7f, v8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
+; GFX6-NEXT:    v_or_b32_e32 v3, v19, v3
+; GFX6-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
 ; GFX6-NEXT:    v_or_b32_e32 v6, v6, v4
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 64, v17
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 64, v19
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[8:9], v4
-; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v17
-; GFX6-NEXT:    v_subrev_i32_e32 v18, vcc, 64, v17
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v19
+; GFX6-NEXT:    v_or_b32_e32 v2, v18, v2
+; GFX6-NEXT:    v_and_b32_e32 v18, 0x7f, v20
+; GFX6-NEXT:    v_subrev_i32_e32 v20, vcc, 64, v19
+; GFX6-NEXT:    v_lshl_b64 v[16:17], v[8:9], v19
 ; GFX6-NEXT:    v_or_b32_e32 v10, v4, v10
 ; GFX6-NEXT:    v_or_b32_e32 v11, v5, v11
-; GFX6-NEXT:    v_lshl_b64 v[4:5], v[8:9], v17
-; GFX6-NEXT:    v_lshl_b64 v[8:9], v[8:9], v18
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX6-NEXT:    v_and_b32_e32 v16, 0x7f, v20
-; GFX6-NEXT:    v_cndmask_b32_e32 v18, 0, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v19, 0, v5, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v10, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, v9, v11, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
-; GFX6-NEXT:    v_cndmask_b32_e32 v8, v4, v6, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v9, v5, v7, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 64, v16
-; GFX6-NEXT:    v_lshr_b64 v[4:5], v[12:13], v16
+; GFX6-NEXT:    v_lshl_b64 v[4:5], v[8:9], v20
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX6-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX6-NEXT:    v_cndmask_b32_e32 v10, v4, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v11, v5, v7, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 64, v18
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[12:13], v18
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[14:15], v6
-; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, 64, v16
-; GFX6-NEXT:    v_or_b32_e32 v11, v4, v6
-; GFX6-NEXT:    v_or_b32_e32 v17, v5, v7
-; GFX6-NEXT:    v_lshr_b64 v[6:7], v[14:15], v10
-; GFX6-NEXT:    v_lshr_b64 v[4:5], v[14:15], v16
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v11, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v11, 0, v5, vcc
-; GFX6-NEXT:    v_or_b32_e32 v4, v18, v6
-; GFX6-NEXT:    v_or_b32_e32 v5, v19, v7
-; GFX6-NEXT:    v_or_b32_e32 v6, v8, v10
-; GFX6-NEXT:    v_or_b32_e32 v7, v9, v11
+; GFX6-NEXT:    v_subrev_i32_e32 v19, vcc, 64, v18
+; GFX6-NEXT:    v_or_b32_e32 v6, v4, v6
+; GFX6-NEXT:    v_or_b32_e32 v7, v5, v7
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[14:15], v19
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[14:15], v18
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v18
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX6-NEXT:    v_or_b32_e32 v0, v21, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, v22, v1
+; GFX6-NEXT:    v_or_b32_e32 v4, v16, v4
+; GFX6-NEXT:    v_or_b32_e32 v5, v17, v5
+; GFX6-NEXT:    v_or_b32_e32 v6, v10, v6
+; GFX6-NEXT:    v_or_b32_e32 v7, v11, v7
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_xor_b32_e32 v17, -1, v16
+; GFX8-NEXT:    v_and_b32_e32 v23, 0x7f, v16
+; GFX8-NEXT:    v_xor_b32_e32 v16, -1, v16
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v23, 0x7f, v17
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 31, v1
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v17
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, 64, v23
-; GFX8-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[21:22], v23, v[2:3]
 ; GFX8-NEXT:    v_and_b32_e32 v24, 0x7f, v16
-; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, 64, v24
-; GFX8-NEXT:    v_or_b32_e32 v21, v17, v21
-; GFX8-NEXT:    v_or_b32_e32 v22, v18, v22
-; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v16, v[10:11]
-; GFX8-NEXT:    v_lshrrev_b64 v[18:19], v24, v[8:9]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX8-NEXT:    v_or_b32_e32 v18, v18, v16
-; GFX8-NEXT:    v_subrev_u32_e32 v16, vcc, 64, v23
-; GFX8-NEXT:    v_or_b32_e32 v19, v19, v17
-; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v16, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v23, v[0:1]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v25, 0, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v21, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v22, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v17, v0, v2, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v16, v16, v3, s[4:5]
-; GFX8-NEXT:    v_subrev_u32_e64 v0, s[4:5], 64, v24
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v0, v[10:11]
-; GFX8-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v24
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, 0, v1, vcc
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v24, v[10:11]
+; GFX8-NEXT:    v_lshlrev_b64 v[16:17], 1, v[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v0
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v24
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, v[16:17]
+; GFX8-NEXT:    v_lshlrev_b64 v[18:19], v24, v[2:3]
+; GFX8-NEXT:    v_subrev_u32_e32 v25, vcc, 64, v24
+; GFX8-NEXT:    v_lshlrev_b64 v[21:22], v24, v[16:17]
+; GFX8-NEXT:    v_or_b32_e32 v18, v0, v18
+; GFX8-NEXT:    v_or_b32_e32 v19, v1, v19
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v25, v[16:17]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, 0, v22, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v18, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v19, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, v1, s[4:5]
-; GFX8-NEXT:    v_or_b32_e32 v0, v25, v2
-; GFX8-NEXT:    v_or_b32_e32 v2, v17, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v1, v3, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v23
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v23, v[8:9]
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, v[10:11]
+; GFX8-NEXT:    v_subrev_u32_e32 v24, vcc, 64, v23
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v24, v[10:11]
+; GFX8-NEXT:    v_lshrrev_b64 v[16:17], v23, v[10:11]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v20
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX8-NEXT:    v_or_b32_e32 v1, v18, v3
-; GFX8-NEXT:    v_or_b32_e32 v3, v16, v9
-; GFX8-NEXT:    v_and_b32_e32 v17, 0x7f, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
+; GFX8-NEXT:    v_or_b32_e32 v3, v19, v3
+; GFX8-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX8-NEXT:    v_lshlrev_b64 v[8:9], 1, v[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
 ; GFX8-NEXT:    v_or_b32_e32 v6, v6, v4
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 64, v17
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 64, v19
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v4, v[8:9]
-; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v17, v[6:7]
-; GFX8-NEXT:    v_subrev_u32_e32 v18, vcc, 64, v17
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v19, v[6:7]
+; GFX8-NEXT:    v_or_b32_e32 v2, v18, v2
+; GFX8-NEXT:    v_and_b32_e32 v18, 0x7f, v20
+; GFX8-NEXT:    v_subrev_u32_e32 v20, vcc, 64, v19
+; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v19, v[8:9]
 ; GFX8-NEXT:    v_or_b32_e32 v10, v4, v10
 ; GFX8-NEXT:    v_or_b32_e32 v11, v5, v11
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v17, v[8:9]
-; GFX8-NEXT:    v_lshlrev_b64 v[8:9], v18, v[8:9]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX8-NEXT:    v_and_b32_e32 v16, 0x7f, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, 0, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, 0, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v10, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v4, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v5, v7, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 64, v16
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v16, v[12:13]
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v20, v[8:9]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v4, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v5, v7, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 64, v18
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v18, v[12:13]
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v6, v[14:15]
-; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, 64, v16
-; GFX8-NEXT:    v_or_b32_e32 v11, v4, v6
-; GFX8-NEXT:    v_or_b32_e32 v17, v5, v7
-; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v10, v[14:15]
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v16, v[14:15]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v11, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, 0, v5, vcc
-; GFX8-NEXT:    v_or_b32_e32 v4, v18, v6
-; GFX8-NEXT:    v_or_b32_e32 v5, v19, v7
-; GFX8-NEXT:    v_or_b32_e32 v6, v8, v10
-; GFX8-NEXT:    v_or_b32_e32 v7, v9, v11
+; GFX8-NEXT:    v_subrev_u32_e32 v19, vcc, 64, v18
+; GFX8-NEXT:    v_or_b32_e32 v6, v4, v6
+; GFX8-NEXT:    v_or_b32_e32 v7, v5, v7
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v19, v[14:15]
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v18, v[14:15]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v18
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX8-NEXT:    v_or_b32_e32 v0, v21, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v22, v1
+; GFX8-NEXT:    v_or_b32_e32 v4, v16, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v17, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, v10, v6
+; GFX8-NEXT:    v_or_b32_e32 v7, v11, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_xor_b32_e32 v17, -1, v16
+; GFX9-NEXT:    v_and_b32_e32 v23, 0x7f, v16
+; GFX9-NEXT:    v_xor_b32_e32 v16, -1, v16
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v23, 0x7f, v17
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 31, v1
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX9-NEXT:    v_or_b32_e32 v2, v2, v17
-; GFX9-NEXT:    v_sub_u32_e32 v17, 64, v23
-; GFX9-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
-; GFX9-NEXT:    v_lshlrev_b64 v[21:22], v23, v[2:3]
 ; GFX9-NEXT:    v_and_b32_e32 v24, 0x7f, v16
-; GFX9-NEXT:    v_sub_u32_e32 v16, 64, v24
-; GFX9-NEXT:    v_or_b32_e32 v21, v17, v21
-; GFX9-NEXT:    v_or_b32_e32 v22, v18, v22
-; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v16, v[10:11]
-; GFX9-NEXT:    v_lshrrev_b64 v[18:19], v24, v[8:9]
+; GFX9-NEXT:    v_lshlrev_b64 v[16:17], 1, v[0:1]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v24
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, v[16:17]
+; GFX9-NEXT:    v_lshlrev_b64 v[18:19], v24, v[2:3]
+; GFX9-NEXT:    v_subrev_u32_e32 v25, 64, v24
+; GFX9-NEXT:    v_lshlrev_b64 v[21:22], v24, v[16:17]
+; GFX9-NEXT:    v_or_b32_e32 v18, v0, v18
+; GFX9-NEXT:    v_or_b32_e32 v19, v1, v19
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v25, v[16:17]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
+; GFX9-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, 0, v22, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v18, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v19, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v0, v2, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v23
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, v1, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v23, v[8:9]
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, v[10:11]
+; GFX9-NEXT:    v_subrev_u32_e32 v24, 64, v23
+; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v24, v[10:11]
+; GFX9-NEXT:    v_lshrrev_b64 v[16:17], v23, v[10:11]
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX9-NEXT:    v_or_b32_e32 v18, v18, v16
-; GFX9-NEXT:    v_subrev_u32_e32 v16, 64, v23
-; GFX9-NEXT:    v_or_b32_e32 v19, v19, v17
-; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v16, v[0:1]
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v23, v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX9-NEXT:    v_cndmask_b32_e32 v25, 0, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v21, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v22, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v17, v0, v2, s[4:5]
-; GFX9-NEXT:    v_subrev_u32_e32 v0, 64, v24
-; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v3, s[4:5]
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v0, v[10:11]
-; GFX9-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v24
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, 0, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, v1, s[4:5]
-; GFX9-NEXT:    v_or_b32_e32 v0, v25, v2
-; GFX9-NEXT:    v_or_b32_e32 v2, v17, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v20
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX9-NEXT:    v_or_b32_e32 v1, v18, v3
-; GFX9-NEXT:    v_or_b32_e32 v3, v16, v9
-; GFX9-NEXT:    v_and_b32_e32 v17, 0x7f, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
+; GFX9-NEXT:    v_or_b32_e32 v3, v19, v3
+; GFX9-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], 1, v[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
 ; GFX9-NEXT:    v_or_b32_e32 v6, v6, v4
-; GFX9-NEXT:    v_sub_u32_e32 v4, 64, v17
+; GFX9-NEXT:    v_sub_u32_e32 v4, 64, v19
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v4, v[8:9]
-; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v17, v[6:7]
-; GFX9-NEXT:    v_subrev_u32_e32 v18, 64, v17
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v19, v[6:7]
+; GFX9-NEXT:    v_or_b32_e32 v2, v18, v2
+; GFX9-NEXT:    v_and_b32_e32 v18, 0x7f, v20
+; GFX9-NEXT:    v_subrev_u32_e32 v20, 64, v19
+; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v19, v[8:9]
 ; GFX9-NEXT:    v_or_b32_e32 v10, v4, v10
 ; GFX9-NEXT:    v_or_b32_e32 v11, v5, v11
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v17, v[8:9]
-; GFX9-NEXT:    v_lshlrev_b64 v[8:9], v18, v[8:9]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX9-NEXT:    v_and_b32_e32 v16, 0x7f, v20
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, 0, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v19, 0, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v10, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v11, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v6, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v6, 64, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v5, v7, vcc
-; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v16, v[12:13]
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v20, v[8:9]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v4, v6, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v6, 64, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v5, v7, vcc
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v18, v[12:13]
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v6, v[14:15]
-; GFX9-NEXT:    v_subrev_u32_e32 v10, 64, v16
-; GFX9-NEXT:    v_or_b32_e32 v11, v4, v6
-; GFX9-NEXT:    v_or_b32_e32 v17, v5, v7
-; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v10, v[14:15]
-; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v16, v[14:15]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v11, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v5, vcc
-; GFX9-NEXT:    v_or_b32_e32 v4, v18, v6
-; GFX9-NEXT:    v_or_b32_e32 v5, v19, v7
-; GFX9-NEXT:    v_or_b32_e32 v6, v8, v10
-; GFX9-NEXT:    v_or_b32_e32 v7, v9, v11
+; GFX9-NEXT:    v_subrev_u32_e32 v19, 64, v18
+; GFX9-NEXT:    v_or_b32_e32 v6, v4, v6
+; GFX9-NEXT:    v_or_b32_e32 v7, v5, v7
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v19, v[14:15]
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v18, v[14:15]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v18
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX9-NEXT:    v_or_b32_e32 v0, v21, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, v22, v1
+; GFX9-NEXT:    v_or_b32_e32 v4, v16, v4
+; GFX9-NEXT:    v_or_b32_e32 v5, v17, v5
+; GFX9-NEXT:    v_or_b32_e32 v6, v10, v6
+; GFX9-NEXT:    v_or_b32_e32 v7, v11, v7
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_v2i128:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 72fdd481dbb67..4e76e48d07211 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -904,23 +904,23 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GFX10-NEXT:    v_mov_b32_e32 v18, s19
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 3, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s10, 2, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 4, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 2, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 3, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s7, 5, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s10, 4, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s8, 6, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s9, 7, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v1, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v0, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v0, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v1, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v1, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v0, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v0, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v0, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v0, s10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, v13, v0, s7
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v1, s10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, v14, v1, s7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v15, v15, v0, s8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v17, v17, v0, s9

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
index ef8db23aaa672..bb7770701a631 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
@@ -7,111 +7,107 @@ declare void @llvm.memcpy.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1
 define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
 ; LOOP-LABEL: memcpy_p1i8:
 ; LOOP:       ; %bb.0:
-; LOOP-NEXT:    s_mov_b32 s2, 0
-; LOOP-NEXT:    s_mov_b32 s3, 0xf000
-; LOOP-NEXT:    s_mov_b64 s[0:1], 0
+; LOOP-NEXT:    s_mov_b32 s6, 0
+; LOOP-NEXT:    s_mov_b32 s7, 0xf000
+; LOOP-NEXT:    s_mov_b64 s[4:5], 0
 ; LOOP-NEXT:    v_mov_b32_e32 v5, v3
 ; LOOP-NEXT:    v_mov_b32_e32 v4, v2
 ; LOOP-NEXT:    v_mov_b32_e32 v7, v1
 ; LOOP-NEXT:    v_mov_b32_e32 v6, v0
-; LOOP-NEXT:    v_mov_b32_e32 v8, s2
+; LOOP-NEXT:    v_mov_b32_e32 v8, s6
 ; LOOP-NEXT:  .LBB0_1: ; %load-store-loop
 ; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; LOOP-NEXT:    s_waitcnt expcnt(1)
-; LOOP-NEXT:    buffer_load_ubyte v9, v[4:5], s[0:3], 0 addr64
+; LOOP-NEXT:    buffer_load_ubyte v9, v[4:5], s[4:7], 0 addr64
+; LOOP-NEXT:    s_waitcnt expcnt(6)
+; LOOP-NEXT:    buffer_load_ubyte v10, v[4:5], s[4:7], 0 addr64 offset:1
+; LOOP-NEXT:    s_waitcnt expcnt(3)
+; LOOP-NEXT:    buffer_load_ubyte v11, v[4:5], s[4:7], 0 addr64 offset:2
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    buffer_load_ubyte v10, v[4:5], s[0:3], 0 addr64 offset:1
-; LOOP-NEXT:    s_waitcnt vmcnt(0)
+; LOOP-NEXT:    buffer_load_ubyte v12, v[4:5], s[4:7], 0 addr64 offset:3
+; LOOP-NEXT:    buffer_load_ubyte v13, v[4:5], s[4:7], 0 addr64 offset:4
+; LOOP-NEXT:    buffer_load_ubyte v14, v[4:5], s[4:7], 0 addr64 offset:5
+; LOOP-NEXT:    buffer_load_ubyte v15, v[4:5], s[4:7], 0 addr64 offset:6
+; LOOP-NEXT:    buffer_load_ubyte v16, v[4:5], s[4:7], 0 addr64 offset:7
+; LOOP-NEXT:    buffer_load_ubyte v17, v[4:5], s[4:7], 0 addr64 offset:8
+; LOOP-NEXT:    buffer_load_ubyte v18, v[4:5], s[4:7], 0 addr64 offset:9
+; LOOP-NEXT:    buffer_load_ubyte v19, v[4:5], s[4:7], 0 addr64 offset:10
+; LOOP-NEXT:    buffer_load_ubyte v20, v[4:5], s[4:7], 0 addr64 offset:11
+; LOOP-NEXT:    buffer_load_ubyte v21, v[4:5], s[4:7], 0 addr64 offset:12
+; LOOP-NEXT:    buffer_load_ubyte v22, v[4:5], s[4:7], 0 addr64 offset:13
+; LOOP-NEXT:    buffer_load_ubyte v23, v[4:5], s[4:7], 0 addr64 offset:14
+; LOOP-NEXT:    buffer_load_ubyte v24, v[4:5], s[4:7], 0 addr64 offset:15
+; LOOP-NEXT:    v_add_i32_e32 v8, vcc, 1, v8
+; LOOP-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; LOOP-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; LOOP-NEXT:    s_and_b64 vcc, s[0:1], exec
+; LOOP-NEXT:    s_waitcnt vmcnt(14)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; LOOP-NEXT:    v_or_b32_e32 v9, v10, v9
-; LOOP-NEXT:    buffer_load_ubyte v10, v[4:5], s[0:3], 0 addr64 offset:2
-; LOOP-NEXT:    buffer_load_ubyte v11, v[4:5], s[0:3], 0 addr64 offset:3
-; LOOP-NEXT:    s_waitcnt vmcnt(0)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; LOOP-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; LOOP-NEXT:    v_or_b32_e32 v10, v11, v10
-; LOOP-NEXT:    v_or_b32_e32 v9, v10, v9
-; LOOP-NEXT:    buffer_load_ubyte v10, v[4:5], s[0:3], 0 addr64 offset:4
-; LOOP-NEXT:    buffer_load_ubyte v11, v[4:5], s[0:3], 0 addr64 offset:5
-; LOOP-NEXT:    s_waitcnt vmcnt(0)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; LOOP-NEXT:    v_or_b32_e32 v10, v11, v10
-; LOOP-NEXT:    buffer_load_ubyte v11, v[4:5], s[0:3], 0 addr64 offset:6
-; LOOP-NEXT:    buffer_load_ubyte v12, v[4:5], s[0:3], 0 addr64 offset:7
-; LOOP-NEXT:    s_waitcnt vmcnt(0)
+; LOOP-NEXT:    s_waitcnt vmcnt(12)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; LOOP-NEXT:    v_or_b32_e32 v11, v12, v11
-; LOOP-NEXT:    v_or_b32_e32 v10, v11, v10
-; LOOP-NEXT:    buffer_load_ubyte v11, v[4:5], s[0:3], 0 addr64 offset:8
-; LOOP-NEXT:    buffer_load_ubyte v12, v[4:5], s[0:3], 0 addr64 offset:9
-; LOOP-NEXT:    s_waitcnt vmcnt(0)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; LOOP-NEXT:    v_or_b32_e32 v11, v12, v11
-; LOOP-NEXT:    buffer_load_ubyte v12, v[4:5], s[0:3], 0 addr64 offset:10
-; LOOP-NEXT:    buffer_load_ubyte v13, v[4:5], s[0:3], 0 addr64 offset:11
-; LOOP-NEXT:    s_waitcnt vmcnt(0)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
-; LOOP-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; LOOP-NEXT:    v_or_b32_e32 v12, v13, v12
-; LOOP-NEXT:    v_or_b32_e32 v11, v12, v11
-; LOOP-NEXT:    buffer_load_ubyte v12, v[4:5], s[0:3], 0 addr64 offset:12
-; LOOP-NEXT:    buffer_load_ubyte v13, v[4:5], s[0:3], 0 addr64 offset:13
-; LOOP-NEXT:    s_waitcnt vmcnt(0)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; LOOP-NEXT:    v_or_b32_e32 v12, v13, v12
-; LOOP-NEXT:    buffer_load_ubyte v13, v[4:5], s[0:3], 0 addr64 offset:14
-; LOOP-NEXT:    buffer_load_ubyte v14, v[4:5], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT:    s_waitcnt vmcnt(10)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; LOOP-NEXT:    s_waitcnt vmcnt(8)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
+; LOOP-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; LOOP-NEXT:    s_waitcnt vmcnt(6)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
+; LOOP-NEXT:    s_waitcnt vmcnt(4)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v20, 24, v20
+; LOOP-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; LOOP-NEXT:    s_waitcnt vmcnt(2)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v22, 8, v22
 ; LOOP-NEXT:    s_waitcnt vmcnt(0)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
-; LOOP-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; LOOP-NEXT:    v_or_b32_e32 v13, v14, v13
-; LOOP-NEXT:    v_or_b32_e32 v12, v13, v12
+; LOOP-NEXT:    v_lshlrev_b32_e32 v24, 24, v24
+; LOOP-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; LOOP-NEXT:    v_or_b32_e32 v9, v10, v9
+; LOOP-NEXT:    v_or_b32_e32 v10, v12, v11
+; LOOP-NEXT:    v_or_b32_e32 v11, v14, v13
+; LOOP-NEXT:    v_or_b32_e32 v12, v16, v15
+; LOOP-NEXT:    v_or_b32_e32 v13, v18, v17
+; LOOP-NEXT:    v_or_b32_e32 v14, v20, v19
+; LOOP-NEXT:    v_or_b32_e32 v15, v22, v21
+; LOOP-NEXT:    v_or_b32_e32 v16, v24, v23
+; LOOP-NEXT:    v_or_b32_e32 v9, v10, v9
+; LOOP-NEXT:    v_or_b32_e32 v10, v12, v11
+; LOOP-NEXT:    v_or_b32_e32 v11, v14, v13
+; LOOP-NEXT:    v_or_b32_e32 v12, v16, v15
 ; LOOP-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
 ; LOOP-NEXT:    v_bfe_u32 v14, v9, 8, 8
-; LOOP-NEXT:    buffer_store_byte v9, v[6:7], s[0:3], 0 addr64
-; LOOP-NEXT:    buffer_store_byte v14, v[6:7], s[0:3], 0 addr64 offset:1
-; LOOP-NEXT:    s_waitcnt expcnt(1)
+; LOOP-NEXT:    buffer_store_byte v9, v[6:7], s[4:7], 0 addr64
+; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    v_lshrrev_b32_e32 v9, 24, v9
-; LOOP-NEXT:    buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:2
-; LOOP-NEXT:    buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT:    v_lshrrev_b32_e32 v15, 16, v10
+; LOOP-NEXT:    v_bfe_u32 v16, v10, 8, 8
+; LOOP-NEXT:    buffer_store_byte v10, v[6:7], s[4:7], 0 addr64 offset:4
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    v_lshrrev_b32_e32 v9, 16, v10
-; LOOP-NEXT:    v_bfe_u32 v13, v10, 8, 8
-; LOOP-NEXT:    buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:4
-; LOOP-NEXT:    buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:5
-; LOOP-NEXT:    s_waitcnt expcnt(1)
 ; LOOP-NEXT:    v_lshrrev_b32_e32 v10, 24, v10
-; LOOP-NEXT:    buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:6
-; LOOP-NEXT:    buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:7
-; LOOP-NEXT:    s_waitcnt expcnt(1)
-; LOOP-NEXT:    v_lshrrev_b32_e32 v9, 16, v11
-; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    v_bfe_u32 v10, v11, 8, 8
-; LOOP-NEXT:    buffer_store_byte v11, v[6:7], s[0:3], 0 addr64 offset:8
-; LOOP-NEXT:    buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:9
+; LOOP-NEXT:    v_lshrrev_b32_e32 v17, 16, v11
+; LOOP-NEXT:    v_bfe_u32 v18, v11, 8, 8
+; LOOP-NEXT:    buffer_store_byte v11, v[6:7], s[4:7], 0 addr64 offset:8
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    v_lshrrev_b32_e32 v10, 24, v11
-; LOOP-NEXT:    buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:10
-; LOOP-NEXT:    buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:11
-; LOOP-NEXT:    s_waitcnt expcnt(1)
-; LOOP-NEXT:    v_lshrrev_b32_e32 v9, 16, v12
+; LOOP-NEXT:    v_lshrrev_b32_e32 v11, 24, v11
+; LOOP-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
+; LOOP-NEXT:    v_bfe_u32 v20, v12, 8, 8
+; LOOP-NEXT:    buffer_store_byte v12, v[6:7], s[4:7], 0 addr64 offset:12
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    v_bfe_u32 v10, v12, 8, 8
-; LOOP-NEXT:    buffer_store_byte v12, v[6:7], s[0:3], 0 addr64 offset:12
-; LOOP-NEXT:    buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:13
-; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    v_lshrrev_b32_e32 v10, 24, v12
-; LOOP-NEXT:    buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:14
-; LOOP-NEXT:    buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:15
-; LOOP-NEXT:    v_add_i32_e32 v8, vcc, 1, v8
-; LOOP-NEXT:    s_xor_b64 s[4:5], vcc, -1
-; LOOP-NEXT:    v_add_i32_e32 v6, vcc, 16, v6
-; LOOP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; LOOP-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
-; LOOP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; LOOP-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; LOOP-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; LOOP-NEXT:    v_lshrrev_b32_e32 v12, 24, v12
+; LOOP-NEXT:    buffer_store_byte v14, v[6:7], s[4:7], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_store_byte v13, v[6:7], s[4:7], 0 addr64 offset:2
+; LOOP-NEXT:    buffer_store_byte v9, v[6:7], s[4:7], 0 addr64 offset:3
+; LOOP-NEXT:    buffer_store_byte v16, v[6:7], s[4:7], 0 addr64 offset:5
+; LOOP-NEXT:    buffer_store_byte v15, v[6:7], s[4:7], 0 addr64 offset:6
+; LOOP-NEXT:    buffer_store_byte v10, v[6:7], s[4:7], 0 addr64 offset:7
+; LOOP-NEXT:    buffer_store_byte v18, v[6:7], s[4:7], 0 addr64 offset:9
+; LOOP-NEXT:    buffer_store_byte v17, v[6:7], s[4:7], 0 addr64 offset:10
+; LOOP-NEXT:    buffer_store_byte v11, v[6:7], s[4:7], 0 addr64 offset:11
+; LOOP-NEXT:    buffer_store_byte v20, v[6:7], s[4:7], 0 addr64 offset:13
+; LOOP-NEXT:    buffer_store_byte v19, v[6:7], s[4:7], 0 addr64 offset:14
+; LOOP-NEXT:    buffer_store_byte v12, v[6:7], s[4:7], 0 addr64 offset:15
+; LOOP-NEXT:    v_add_i32_e64 v6, s[0:1], 16, v6
+; LOOP-NEXT:    v_addc_u32_e64 v7, s[0:1], 0, v7, s[0:1]
+; LOOP-NEXT:    v_add_i32_e64 v4, s[0:1], 16, v4
+; LOOP-NEXT:    v_addc_u32_e64 v5, s[0:1], 0, v5, s[0:1]
 ; LOOP-NEXT:    s_cbranch_vccnz .LBB0_1
 ; LOOP-NEXT:  ; %bb.2: ; %memcpy-split
 ; LOOP-NEXT:    s_mov_b32 s2, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index d5ad0062aff3e..50f62aac1a526 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -1645,208 +1645,208 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX7-LABEL: v_mul_i256:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
-; GFX7-NEXT:    v_mul_lo_u32 v28, v4, v11
-; GFX7-NEXT:    v_mul_lo_u32 v27, v5, v10
-; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
-; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
-; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX7-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX7-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
-; GFX7-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX7-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
-; GFX7-NEXT:    v_addc_u32_e32 v25, vcc, 0, v20, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v20, v18
-; GFX7-NEXT:    v_mov_b32_e32 v18, v19
-; GFX7-NEXT:    v_mov_b32_e32 v19, v16
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX7-NEXT:    v_mul_lo_u32 v16, v6, v9
-; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GFX7-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
-; GFX7-NEXT:    v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5]
-; GFX7-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
-; GFX7-NEXT:    v_mov_b32_e32 v19, v22
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
-; GFX7-NEXT:    v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
-; GFX7-NEXT:    v_mul_lo_u32 v24, v3, v12
-; GFX7-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
-; GFX7-NEXT:    v_mul_lo_u32 v22, v2, v13
-; GFX7-NEXT:    v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
-; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[12:13]
-; GFX7-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
-; GFX7-NEXT:    v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13]
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
-; GFX7-NEXT:    v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX7-NEXT:    v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13]
-; GFX7-NEXT:    v_mov_b32_e32 v20, v11
-; GFX7-NEXT:    v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX7-NEXT:    v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
-; GFX7-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX7-NEXT:    v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13]
-; GFX7-NEXT:    v_mul_lo_u32 v9, v1, v14
-; GFX7-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
-; GFX7-NEXT:    v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13]
-; GFX7-NEXT:    v_mul_lo_u32 v0, v0, v15
-; GFX7-NEXT:    v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13]
-; GFX7-NEXT:    v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13]
-; GFX7-NEXT:    v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13]
-; GFX7-NEXT:    v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13]
-; GFX7-NEXT:    v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15]
-; GFX7-NEXT:    v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11]
-; GFX7-NEXT:    v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9]
-; GFX7-NEXT:    v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7]
-; GFX7-NEXT:    v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
-; GFX7-NEXT:    v_addc_u32_e32 v0, vcc, v0, v16, vcc
-; GFX7-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v10
+; GFX7-NEXT:    v_mov_b32_e32 v16, v0
+; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
+; GFX7-NEXT:    v_mov_b32_e32 v17, v1
+; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
+; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
+; GFX7-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
+; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
+; GFX7-NEXT:    v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
+; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
+; GFX7-NEXT:    v_addc_u32_e32 v25, vcc, 0, v24, vcc
+; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
+; GFX7-NEXT:    v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
+; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[4:5]
+; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
+; GFX7-NEXT:    v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
+; GFX7-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
+; GFX7-NEXT:    v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
+; GFX7-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v18, v23
+; GFX7-NEXT:    v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
+; GFX7-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
+; GFX7-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v0, v20
+; GFX7-NEXT:    v_mov_b32_e32 v1, v23
+; GFX7-NEXT:    v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
+; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
+; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
+; GFX7-NEXT:    v_mul_lo_u32 v20, v6, v9
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[8:9]
+; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
+; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
+; GFX7-NEXT:    v_mul_lo_u32 v23, v5, v10
+; GFX7-NEXT:    v_mul_lo_u32 v26, v4, v11
+; GFX7-NEXT:    v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
+; GFX7-NEXT:    v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
+; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
+; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
+; GFX7-NEXT:    v_mul_lo_u32 v13, v2, v13
+; GFX7-NEXT:    v_mov_b32_e32 v2, v22
+; GFX7-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
+; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
+; GFX7-NEXT:    v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
+; GFX7-NEXT:    v_mul_lo_u32 v12, v3, v12
+; GFX7-NEXT:    v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
+; GFX7-NEXT:    v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9]
+; GFX7-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
+; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
+; GFX7-NEXT:    v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9]
+; GFX7-NEXT:    v_mul_lo_u32 v10, v16, v15
+; GFX7-NEXT:    v_mul_lo_u32 v9, v17, v14
+; GFX7-NEXT:    v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9]
+; GFX7-NEXT:    v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9]
+; GFX7-NEXT:    v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9]
+; GFX7-NEXT:    v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9]
+; GFX7-NEXT:    v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15]
+; GFX7-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13]
+; GFX7-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11]
+; GFX7-NEXT:    v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7]
+; GFX7-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
+; GFX7-NEXT:    v_addc_u32_e32 v9, vcc, v9, v20, vcc
+; GFX7-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_i256:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
-; GFX8-NEXT:    v_mul_lo_u32 v28, v4, v11
-; GFX8-NEXT:    v_mul_lo_u32 v27, v5, v10
-; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
-; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
-; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
-; GFX8-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX8-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
-; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, 0, v20, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v20, v18
-; GFX8-NEXT:    v_mov_b32_e32 v18, v19
-; GFX8-NEXT:    v_mov_b32_e32 v19, v16
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX8-NEXT:    v_mul_lo_u32 v16, v6, v9
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GFX8-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
-; GFX8-NEXT:    v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5]
-; GFX8-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
-; GFX8-NEXT:    v_mov_b32_e32 v19, v22
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
-; GFX8-NEXT:    v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
-; GFX8-NEXT:    v_mul_lo_u32 v24, v3, v12
-; GFX8-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
-; GFX8-NEXT:    v_mul_lo_u32 v22, v2, v13
-; GFX8-NEXT:    v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[12:13]
-; GFX8-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
-; GFX8-NEXT:    v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13]
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
-; GFX8-NEXT:    v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX8-NEXT:    v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13]
-; GFX8-NEXT:    v_mov_b32_e32 v20, v11
-; GFX8-NEXT:    v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
-; GFX8-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX8-NEXT:    v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13]
-; GFX8-NEXT:    v_mul_lo_u32 v9, v1, v14
-; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
-; GFX8-NEXT:    v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13]
-; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v15
-; GFX8-NEXT:    v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13]
-; GFX8-NEXT:    v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13]
-; GFX8-NEXT:    v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13]
-; GFX8-NEXT:    v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13]
-; GFX8-NEXT:    v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15]
-; GFX8-NEXT:    v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11]
-; GFX8-NEXT:    v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9]
-; GFX8-NEXT:    v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7]
-; GFX8-NEXT:    v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
-; GFX8-NEXT:    v_addc_u32_e32 v0, vcc, v0, v16, vcc
-; GFX8-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v0, v10
+; GFX8-NEXT:    v_mov_b32_e32 v16, v0
+; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
+; GFX8-NEXT:    v_mov_b32_e32 v17, v1
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
+; GFX8-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
+; GFX8-NEXT:    v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
+; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, 0, v24, vcc
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
+; GFX8-NEXT:    v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[4:5]
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
+; GFX8-NEXT:    v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
+; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
+; GFX8-NEXT:    v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
+; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v18, v23
+; GFX8-NEXT:    v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
+; GFX8-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
+; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, v20
+; GFX8-NEXT:    v_mov_b32_e32 v1, v23
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
+; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
+; GFX8-NEXT:    v_mul_lo_u32 v20, v6, v9
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[8:9]
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
+; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
+; GFX8-NEXT:    v_mul_lo_u32 v23, v5, v10
+; GFX8-NEXT:    v_mul_lo_u32 v26, v4, v11
+; GFX8-NEXT:    v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
+; GFX8-NEXT:    v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
+; GFX8-NEXT:    v_mul_lo_u32 v13, v2, v13
+; GFX8-NEXT:    v_mov_b32_e32 v2, v22
+; GFX8-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
+; GFX8-NEXT:    v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
+; GFX8-NEXT:    v_mul_lo_u32 v12, v3, v12
+; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
+; GFX8-NEXT:    v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9]
+; GFX8-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
+; GFX8-NEXT:    v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9]
+; GFX8-NEXT:    v_mul_lo_u32 v10, v16, v15
+; GFX8-NEXT:    v_mul_lo_u32 v9, v17, v14
+; GFX8-NEXT:    v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9]
+; GFX8-NEXT:    v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9]
+; GFX8-NEXT:    v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9]
+; GFX8-NEXT:    v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9]
+; GFX8-NEXT:    v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15]
+; GFX8-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13]
+; GFX8-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11]
+; GFX8-NEXT:    v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7]
+; GFX8-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v9, v20, vcc
+; GFX8-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_i256:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
-; GFX9-NEXT:    v_mul_lo_u32 v28, v4, v11
-; GFX9-NEXT:    v_mul_lo_u32 v27, v5, v10
-; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
-; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
-; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX9-NEXT:    v_addc_co_u32_e32 v20, vcc, 0, v20, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX9-NEXT:    v_addc_co_u32_e32 v20, vcc, 0, v20, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX9-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
-; GFX9-NEXT:    v_addc_co_u32_e32 v25, vcc, 0, v20, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v20, v18
-; GFX9-NEXT:    v_mov_b32_e32 v18, v19
-; GFX9-NEXT:    v_mov_b32_e32 v19, v16
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX9-NEXT:    v_mul_lo_u32 v16, v6, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GFX9-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
-; GFX9-NEXT:    v_addc_co_u32_e64 v26, s[4:5], 0, v6, s[4:5]
-; GFX9-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
-; GFX9-NEXT:    v_mov_b32_e32 v19, v22
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
-; GFX9-NEXT:    v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
-; GFX9-NEXT:    v_mul_lo_u32 v24, v3, v12
-; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
-; GFX9-NEXT:    v_mul_lo_u32 v22, v2, v13
-; GFX9-NEXT:    v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[12:13]
-; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13]
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
-; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX9-NEXT:    v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13]
-; GFX9-NEXT:    v_mov_b32_e32 v20, v11
-; GFX9-NEXT:    v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
-; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[12:13], 0, v2, s[12:13]
-; GFX9-NEXT:    v_mul_lo_u32 v9, v1, v14
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[12:13], v12, v3, s[12:13]
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v15
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[12:13], v26, v4, s[12:13]
-; GFX9-NEXT:    v_addc_co_u32_e64 v5, s[12:13], v11, v5, s[12:13]
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[12:13], v25, v6, s[12:13]
-; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[12:13], v17, v0, s[12:13]
-; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[12:13], v0, v9, s[14:15]
-; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[10:11], v0, v22, s[10:11]
-; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[8:9], v0, v24, s[8:9]
-; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[6:7], v0, v28, s[6:7]
-; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5]
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v16, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v10
+; GFX9-NEXT:    v_mov_b32_e32 v16, v0
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
+; GFX9-NEXT:    v_mov_b32_e32 v17, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
+; GFX9-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
+; GFX9-NEXT:    v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
+; GFX9-NEXT:    v_addc_co_u32_e32 v25, vcc, 0, v24, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[4:5]
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
+; GFX9-NEXT:    v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
+; GFX9-NEXT:    v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v18, v23
+; GFX9-NEXT:    v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
+; GFX9-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
+; GFX9-NEXT:    v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, v20
+; GFX9-NEXT:    v_mov_b32_e32 v1, v23
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
+; GFX9-NEXT:    v_mul_lo_u32 v20, v6, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[8:9]
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
+; GFX9-NEXT:    v_mul_lo_u32 v23, v5, v10
+; GFX9-NEXT:    v_mul_lo_u32 v26, v4, v11
+; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
+; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9]
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
+; GFX9-NEXT:    v_mul_lo_u32 v13, v2, v13
+; GFX9-NEXT:    v_mov_b32_e32 v2, v22
+; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
+; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9]
+; GFX9-NEXT:    v_mul_lo_u32 v12, v3, v12
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
+; GFX9-NEXT:    v_addc_co_u32_e64 v18, s[8:9], 0, v6, s[8:9]
+; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
+; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[8:9], v9, v3, s[8:9]
+; GFX9-NEXT:    v_mul_lo_u32 v10, v16, v15
+; GFX9-NEXT:    v_mul_lo_u32 v9, v17, v14
+; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v25, v4, s[8:9]
+; GFX9-NEXT:    v_addc_co_u32_e64 v5, s[8:9], v18, v5, s[8:9]
+; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[8:9], v21, v6, s[8:9]
+; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[8:9], v24, v10, s[8:9]
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[8:9], v10, v9, s[14:15]
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[8:9], v9, v13, s[12:13]
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[8:9], v9, v12, s[10:11]
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[6:7], v9, v26, s[6:7]
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], v9, v23, s[4:5]
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v20, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_i256:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index ea75cbc42c119..0f2ea13d4752f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -2007,9 +2007,8 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v17
 ; GFX6-NEXT:    v_min_i32_e32 v17, 0, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v17, v17, v20
-; GFX6-NEXT:    buffer_load_dword v20, off, s[0:3], s32
 ; GFX6-NEXT:    v_max_i32_e32 v19, 0, v4
+; GFX6-NEXT:    v_max_i32_e32 v17, v17, v20
 ; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v17
@@ -2026,69 +2025,70 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v22
 ; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    buffer_load_dword v19, off, s[0:3], s32
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v17
 ; GFX6-NEXT:    v_min_i32_e32 v17, 0, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v7
+; GFX6-NEXT:    v_max_i32_e32 v20, 0, v7
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v23
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v18, v20
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX6-NEXT:    v_min_i32_e32 v20, 0, v8
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
-; GFX6-NEXT:    v_min_i32_e32 v17, 0, v8
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v8
-; GFX6-NEXT:    v_max_i32_e32 v17, v17, v24
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_max_i32_e32 v17, 0, v8
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
+; GFX6-NEXT:    v_max_i32_e32 v20, v20, v24
+; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
+; GFX6-NEXT:    v_min_i32_e32 v20, 0, v9
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v17
-; GFX6-NEXT:    v_min_i32_e32 v17, 0, v9
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v9
-; GFX6-NEXT:    v_max_i32_e32 v17, v17, v25
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_max_i32_e32 v17, 0, v9
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
+; GFX6-NEXT:    v_max_i32_e32 v20, v20, v25
+; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
+; GFX6-NEXT:    v_min_i32_e32 v20, 0, v10
 ; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
-; GFX6-NEXT:    v_min_i32_e32 v17, 0, v10
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v10
-; GFX6-NEXT:    v_max_i32_e32 v17, v17, v26
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_max_i32_e32 v17, 0, v10
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
+; GFX6-NEXT:    v_max_i32_e32 v20, v20, v26
+; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
+; GFX6-NEXT:    v_min_i32_e32 v20, 0, v11
 ; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
-; GFX6-NEXT:    v_min_i32_e32 v17, 0, v11
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v11
-; GFX6-NEXT:    v_max_i32_e32 v17, v17, v27
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_max_i32_e32 v17, 0, v11
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
+; GFX6-NEXT:    v_max_i32_e32 v20, v20, v27
+; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
+; GFX6-NEXT:    v_min_i32_e32 v20, 0, v12
 ; GFX6-NEXT:    v_add_i32_e32 v11, vcc, v11, v17
-; GFX6-NEXT:    v_min_i32_e32 v17, 0, v12
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v12
-; GFX6-NEXT:    v_max_i32_e32 v17, v17, v28
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_max_i32_e32 v17, 0, v12
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
+; GFX6-NEXT:    v_max_i32_e32 v20, v20, v28
+; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
+; GFX6-NEXT:    v_min_i32_e32 v20, 0, v13
 ; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
-; GFX6-NEXT:    v_min_i32_e32 v17, 0, v13
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v13
-; GFX6-NEXT:    v_max_i32_e32 v17, v17, v29
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_max_i32_e32 v17, 0, v13
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
+; GFX6-NEXT:    v_max_i32_e32 v20, v20, v29
+; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
+; GFX6-NEXT:    v_min_i32_e32 v20, 0, v14
 ; GFX6-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
-; GFX6-NEXT:    v_min_i32_e32 v17, 0, v14
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v14
-; GFX6-NEXT:    v_max_i32_e32 v17, v17, v30
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_max_i32_e32 v17, 0, v14
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
+; GFX6-NEXT:    v_max_i32_e32 v20, v20, v30
+; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
 ; GFX6-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, 0, v15
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
 ; GFX6-NEXT:    v_min_i32_e32 v18, 0, v15
 ; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v16, v18
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_max_i32_e32 v16, v16, v20
+; GFX6-NEXT:    v_max_i32_e32 v16, v16, v19
 ; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
 ; GFX6-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -2130,9 +2130,8 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v17
 ; GFX8-NEXT:    v_min_i32_e32 v17, 0, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v17, v17, v20
-; GFX8-NEXT:    buffer_load_dword v20, off, s[0:3], s32
 ; GFX8-NEXT:    v_max_i32_e32 v19, 0, v4
+; GFX8-NEXT:    v_max_i32_e32 v17, v17, v20
 ; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v17
@@ -2149,69 +2148,70 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v22
 ; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    buffer_load_dword v19, off, s[0:3], s32
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v17
 ; GFX8-NEXT:    v_min_i32_e32 v17, 0, v7
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v19, 0, v7
+; GFX8-NEXT:    v_max_i32_e32 v20, 0, v7
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v23
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v18, v20
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX8-NEXT:    v_min_i32_e32 v20, 0, v8
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v17
-; GFX8-NEXT:    v_min_i32_e32 v17, 0, v8
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v19, 0, v8
-; GFX8-NEXT:    v_max_i32_e32 v17, v17, v24
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_max_i32_e32 v17, 0, v8
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v16, v20
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
+; GFX8-NEXT:    v_max_i32_e32 v20, v20, v24
+; GFX8-NEXT:    v_min_i32_e32 v17, v20, v17
+; GFX8-NEXT:    v_min_i32_e32 v20, 0, v9
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v17
-; GFX8-NEXT:    v_min_i32_e32 v17, 0, v9
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v19, 0, v9
-; GFX8-NEXT:    v_max_i32_e32 v17, v17, v25
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_max_i32_e32 v17, 0, v9
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v16, v20
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
+; GFX8-NEXT:    v_max_i32_e32 v20, v20, v25
+; GFX8-NEXT:    v_min_i32_e32 v17, v20, v17
+; GFX8-NEXT:    v_min_i32_e32 v20, 0, v10
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v17
-; GFX8-NEXT:    v_min_i32_e32 v17, 0, v10
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v19, 0, v10
-; GFX8-NEXT:    v_max_i32_e32 v17, v17, v26
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_max_i32_e32 v17, 0, v10
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v16, v20
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
+; GFX8-NEXT:    v_max_i32_e32 v20, v20, v26
+; GFX8-NEXT:    v_min_i32_e32 v17, v20, v17
+; GFX8-NEXT:    v_min_i32_e32 v20, 0, v11
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v17
-; GFX8-NEXT:    v_min_i32_e32 v17, 0, v11
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v19, 0, v11
-; GFX8-NEXT:    v_max_i32_e32 v17, v17, v27
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_max_i32_e32 v17, 0, v11
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v16, v20
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
+; GFX8-NEXT:    v_max_i32_e32 v20, v20, v27
+; GFX8-NEXT:    v_min_i32_e32 v17, v20, v17
+; GFX8-NEXT:    v_min_i32_e32 v20, 0, v12
 ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v17
-; GFX8-NEXT:    v_min_i32_e32 v17, 0, v12
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v19, 0, v12
-; GFX8-NEXT:    v_max_i32_e32 v17, v17, v28
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_max_i32_e32 v17, 0, v12
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v16, v20
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
+; GFX8-NEXT:    v_max_i32_e32 v20, v20, v28
+; GFX8-NEXT:    v_min_i32_e32 v17, v20, v17
+; GFX8-NEXT:    v_min_i32_e32 v20, 0, v13
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v17
-; GFX8-NEXT:    v_min_i32_e32 v17, 0, v13
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v19, 0, v13
-; GFX8-NEXT:    v_max_i32_e32 v17, v17, v29
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_max_i32_e32 v17, 0, v13
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v16, v20
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
+; GFX8-NEXT:    v_max_i32_e32 v20, v20, v29
+; GFX8-NEXT:    v_min_i32_e32 v17, v20, v17
+; GFX8-NEXT:    v_min_i32_e32 v20, 0, v14
 ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v17
-; GFX8-NEXT:    v_min_i32_e32 v17, 0, v14
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v19, 0, v14
-; GFX8-NEXT:    v_max_i32_e32 v17, v17, v30
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_max_i32_e32 v17, 0, v14
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v16, v20
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
+; GFX8-NEXT:    v_max_i32_e32 v20, v20, v30
+; GFX8-NEXT:    v_min_i32_e32 v17, v20, v17
 ; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, 0, v15
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
 ; GFX8-NEXT:    v_min_i32_e32 v18, 0, v15
 ; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, v16, v18
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_i32_e32 v16, v16, v20
+; GFX8-NEXT:    v_max_i32_e32 v16, v16, v19
 ; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 2f422cd9510d1..673dda8f59ee8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -426,24 +426,24 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
 ; GISEL-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12]
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v13, v0, v9
+; GISEL-NEXT:    v_xor_b32_e32 v12, v0, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v0, v15, v10
-; GISEL-NEXT:    v_mul_lo_u32 v12, v16, v11
+; GISEL-NEXT:    v_mul_lo_u32 v13, v16, v11
 ; GISEL-NEXT:    v_xor_b32_e32 v14, v1, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v1, v16, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v15, v10
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v1, v15, v11
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
-; GISEL-NEXT:    v_mul_hi_u32 v12, v16, v11
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT:    v_mul_hi_u32 v13, v16, v11
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v15, v11
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -452,189 +452,189 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v15, v1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v14, v0
-; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v1
-; GISEL-NEXT:    v_mul_hi_u32 v12, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v11, v12, v1
+; GISEL-NEXT:    v_mul_hi_u32 v13, v12, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v14, v0
-; GISEL-NEXT:    v_xor_b32_e32 v8, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v15, v14, v1
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v14, v1
+; GISEL-NEXT:    v_mul_lo_u32 v13, v14, v1
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v11, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v11, v12, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v0, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v14, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v15, 0
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v0, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v12, v10
-; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v5, v16, v[1:2]
-; GISEL-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v4, v15, v[10:11]
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v5, v15, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v4, v13, v[10:11]
+; GISEL-NEXT:    v_xor_b32_e32 v8, v9, v8
+; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v14, v10, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v10, s[4:5], v14, v10
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v4
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v10, v4, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
+; GISEL-NEXT:    v_subbrev_u32_e32 v12, vcc, 0, v1, vcc
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v10, 31, v7
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v10
 ; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v7, v10, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v7, v1, v10
-; GISEL-NEXT:    v_xor_b32_e32 v12, v6, v10
+; GISEL-NEXT:    v_xor_b32_e32 v6, v6, v10
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, v7
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v12
-; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v13, v0
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v14, v11
-; GISEL-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v14, v6
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, 1, v13
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v15, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v14
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], v14, v11, vcc
-; GISEL-NEXT:    v_subb_u32_e32 v6, vcc, v0, v4, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v12, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v11, v1
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v11
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v0
-; GISEL-NEXT:    v_sub_i32_e32 v18, vcc, 0, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v18, v14, 0
-; GISEL-NEXT:    v_subb_u32_e32 v19, vcc, 0, v12, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v5
-; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v13, v5
-; GISEL-NEXT:    v_subbrev_u32_e32 v20, vcc, 0, v6, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v18, v11, v[1:2]
-; GISEL-NEXT:    v_mul_lo_u32 v1, v11, v0
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v20, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v19, v14, v[5:6]
-; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v17, v4
-; GISEL-NEXT:    v_mul_lo_u32 v6, v14, v5
-; GISEL-NEXT:    v_add_i32_e64 v1, s[6:7], v1, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v14, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[6:7]
-; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT:    v_add_i32_e64 v1, s[6:7], v1, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[8:9]
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v4
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v17, v4
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, -1, s[8:9]
-; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v16, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v20, v21, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v20, vcc, 1, v6
-; GISEL-NEXT:    v_addc_u32_e32 v21, vcc, 0, v17, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v20, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v17, v17, v21, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT:    v_mul_hi_u32 v13, v14, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v5, v11, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v20, v13
+; GISEL-NEXT:    v_trunc_f32_e32 v18, v1
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v18
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v19, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v20, vcc, 0, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v18, v18
+; GISEL-NEXT:    v_subb_u32_e32 v21, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v12, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v14, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v20, v18, v[1:2]
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v16
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v21, v19, v[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v17, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v16, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v18, v0
+; GISEL-NEXT:    v_mul_lo_u32 v12, v19, v4
+; GISEL-NEXT:    v_mul_hi_u32 v16, v19, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v14, v17, v14, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v18, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v16, v18, v4
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT:    v_mul_hi_u32 v12, v19, v4
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
+; GISEL-NEXT:    v_mul_hi_u32 v4, v18, v4
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v0
-; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, v11, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v18, v13, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v18, v11, v[1:2]
-; GISEL-NEXT:    v_xor_b32_e32 v1, v6, v8
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v19, v13, v[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v14, v16, v17, vcc
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v5, v2, v6
-; GISEL-NEXT:    v_mul_lo_u32 v2, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v9, v13, v4
-; GISEL-NEXT:    v_xor_b32_e32 v15, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v3, v13, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v19, v0
+; GISEL-NEXT:    v_addc_u32_e32 v16, vcc, v18, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v20, v12, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v13, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v20, v16, v[1:2]
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v15, v14, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v21, v12, v[4:5]
+; GISEL-NEXT:    v_xor_b32_e32 v1, v11, v8
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v5, v2, v9
+; GISEL-NEXT:    v_mul_lo_u32 v2, v16, v0
+; GISEL-NEXT:    v_mul_lo_u32 v11, v12, v4
+; GISEL-NEXT:    v_xor_b32_e32 v14, v3, v9
+; GISEL-NEXT:    v_mul_hi_u32 v3, v12, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v16, v0
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v11, v4
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT:    v_mul_hi_u32 v9, v13, v4
+; GISEL-NEXT:    v_mul_lo_u32 v3, v16, v4
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v11, v2
+; GISEL-NEXT:    v_mul_hi_u32 v11, v12, v4
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
+; GISEL-NEXT:    v_mul_hi_u32 v4, v16, v4
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v11, v2, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v15, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v16, v2, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v3, v14, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v4, v5, v2
-; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT:    v_mul_hi_u32 v13, v15, v2
+; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT:    v_xor_b32_e32 v11, v13, v8
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v15, v2
+; GISEL-NEXT:    v_mul_lo_u32 v12, v14, v2
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v5, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v11, 0
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v0, v3
+; GISEL-NEXT:    v_mul_hi_u32 v13, v14, v2
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
 ; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1]
-; GISEL-NEXT:    v_xor_b32_e32 v9, v14, v8
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v1, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v11, v[3:4]
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v9, v8, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4]
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v11, v8, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v15, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v15, v3
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v12
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v14, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v14, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v6
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v7
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v12
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v6
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, v5, v8, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v11
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v12
 ; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, 0, v13, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v12
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v8, v7, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v4, v6, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v12, v2, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v4, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
@@ -2611,89 +2611,89 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v11, v9
 ; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[6:7], v7, v15, v[1:2]
 ; GISEL-NEXT:    v_lshl_b64 v[11:12], s[4:5], v6
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v12
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v13, v0
 ; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v5, v14, v[9:10]
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v16, v9, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v16, v9
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v6, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v9, v10, s[4:5]
+; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v12
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v6
 ; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v12, v6, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v11, v1, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v12, v10, v6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, v11
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v10, v12
-; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v13, v0
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v16, v9
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, 1, v14
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v15, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v10
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], v16, v9, vcc
-; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, v0, v5, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v7
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v10, v1
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v10
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v16, v0
-; GISEL-NEXT:    v_sub_i32_e32 v18, vcc, 0, v11
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v21, v10
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v18, v16, 0
-; GISEL-NEXT:    v_subb_u32_e32 v19, vcc, 0, v12, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v7
-; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v13, v7
-; GISEL-NEXT:    v_subbrev_u32_e32 v20, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[6:7], v18, v21, v[1:2]
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v7
-; GISEL-NEXT:    v_mul_lo_u32 v1, v21, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[6:7], v19, v16, v[9:10]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v20, v5
-; GISEL-NEXT:    v_mul_lo_u32 v7, v16, v9
-; GISEL-NEXT:    v_add_i32_e64 v1, s[6:7], v1, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v16, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[6:7]
-; GISEL-NEXT:    v_mul_hi_u32 v0, v21, v0
-; GISEL-NEXT:    v_add_i32_e64 v1, s[6:7], v1, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[8:9]
+; GISEL-NEXT:    v_trunc_f32_e32 v18, v1
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v18
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v19, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v17, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[8:9]
-; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v17, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v13, v20, s[4:5]
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v15, vcc
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, 1, v7
-; GISEL-NEXT:    v_addc_u32_e32 v20, vcc, 0, v13, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v13, v13, v20, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v20, vcc, 0, v11
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v18, v18
+; GISEL-NEXT:    v_subb_u32_e32 v21, vcc, 0, v12, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v10, v7, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v20, v18, v[1:2]
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v16
+; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v21, v19, v[9:10]
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v17, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v16, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v18, v0
+; GISEL-NEXT:    v_mul_lo_u32 v10, v19, v9
+; GISEL-NEXT:    v_mul_hi_u32 v16, v19, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v17, v7, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v18, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v16, v18, v9
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT:    v_mul_lo_u32 v10, v21, v9
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v10, v16, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v9, v21, v9
+; GISEL-NEXT:    v_mul_hi_u32 v10, v19, v9
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v17, v10
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v16, v10
+; GISEL-NEXT:    v_mul_hi_u32 v9, v18, v9
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v16, v0
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v21, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v18, v9, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v14, v7, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v18, v10, v[1:2]
-; GISEL-NEXT:    v_xor_b32_e32 v1, v7, v8
-; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v19, v9, v[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v13, v15, v13, vcc
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v5, v2, v7
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v19, v0
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v18, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v20, v9, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v14, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v20, v10, v[1:2]
+; GISEL-NEXT:    v_xor_b32_e32 v1, v13, v8
+; GISEL-NEXT:    v_ashrrev_i32_e32 v13, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v21, v9, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v13, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v5, v2, v13
 ; GISEL-NEXT:    v_mul_lo_u32 v2, v10, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v4
-; GISEL-NEXT:    v_xor_b32_e32 v15, v3, v7
+; GISEL-NEXT:    v_xor_b32_e32 v15, v3, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v3, v9, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v14
@@ -2717,32 +2717,32 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v10, v2, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v15, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v4, v5, v2
-; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v0
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT:    v_xor_b32_e32 v9, v13, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v15, v2
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v10
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v15, v2
+; GISEL-NEXT:    v_mul_lo_u32 v9, v15, v2
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v5, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v3
-; GISEL-NEXT:    v_mul_hi_u32 v13, v15, v2
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v0, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v11, v9, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v0
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v11, v13, v[0:1]
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v11, v10, v[0:1]
+; GISEL-NEXT:    v_xor_b32_e32 v7, v7, v8
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v1, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v10, v[3:4]
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v9, v8, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v9, v[3:4]
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v7, v8, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v5, v2
 ; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v15, v3, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v15, v3
@@ -2751,27 +2751,27 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v11
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v12
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v5, v8, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v10
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, 0, v13, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v5, v7, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v9
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v10, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v12
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, 0, v7, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v4, v7, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v4, v13, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
@@ -3302,90 +3302,90 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v11
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GISEL-NEXT:    v_subbrev_u32_e32 v11, vcc, 0, v4, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v4, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v11, v4
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v11
+; GISEL-NEXT:    v_trunc_f32_e32 v4, v4
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v4
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v0
-; GISEL-NEXT:    v_sub_i32_e32 v15, vcc, 0, v8
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
-; GISEL-NEXT:    v_subb_u32_e32 v16, vcc, 0, v10, vcc
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, 1, v9
-; GISEL-NEXT:    v_addc_u32_e32 v18, vcc, 0, v12, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, 0, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v6, v7, s[4:5]
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v15, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v16, v14, 0
+; GISEL-NEXT:    v_subb_u32_e32 v17, vcc, 0, v10, vcc
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v16, v15, v[0:1]
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, 1, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v17, v14, v[5:6]
+; GISEL-NEXT:    v_addc_u32_e32 v19, vcc, 0, v12, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v15, v11, v[0:1]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v16, v14, v[2:3]
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v1
-; GISEL-NEXT:    v_mul_lo_u32 v1, v11, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v14, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v3, v15, v4
+; GISEL-NEXT:    v_mul_lo_u32 v6, v14, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v0, v2, vcc
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v14, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v4
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v3, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v11, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT:    v_mul_hi_u32 v5, v14, v2
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT:    v_mul_hi_u32 v2, v11, v2
+; GISEL-NEXT:    v_mul_lo_u32 v1, v15, v5
+; GISEL-NEXT:    v_mul_hi_u32 v3, v15, v4
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GISEL-NEXT:    v_mul_hi_u32 v2, v14, v5
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GISEL-NEXT:    v_mul_hi_u32 v3, v15, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v14, v0
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v11, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v15, v4, 0
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v17
-; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v15, v5, v[1:2]
-; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, 0, v18, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v14, v0
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v15, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v3, 0
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v18
 ; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v16, v4, v[1:2]
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v17, v7, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v18, v11, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v9, v3, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v2, v5, v0
-; GISEL-NEXT:    v_mul_lo_u32 v3, v4, v1
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v19, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v17, v3, v[1:2]
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v18, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v19, v6, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    v_mul_lo_u32 v2, v4, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], 0, v13
-; GISEL-NEXT:    v_mul_hi_u32 v13, v4, v0
+; GISEL-NEXT:    v_mul_hi_u32 v13, v3, v0
 ; GISEL-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v1
-; GISEL-NEXT:    v_mul_hi_u32 v0, v5, v0
-; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v3, v2
-; GISEL-NEXT:    v_mul_hi_u32 v3, v4, v1
+; GISEL-NEXT:    v_mul_lo_u32 v13, v4, v1
+; GISEL-NEXT:    v_mul_hi_u32 v0, v4, v0
+; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v7, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v3, v1
 ; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v13, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v0, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], v13, v3
-; GISEL-NEXT:    v_mul_hi_u32 v1, v5, v1
+; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v13, v7
+; GISEL-NEXT:    v_mul_hi_u32 v1, v4, v1
 ; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v3, v2
+; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v7, v2
 ; GISEL-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v2
-; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v4, v0
-; GISEL-NEXT:    v_addc_u32_e64 v1, s[4:5], v5, v1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v3, v0
+; GISEL-NEXT:    v_addc_u32_e64 v1, s[4:5], v4, v1, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v2, v11, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v9, v1
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v9, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v12, v7, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
@@ -3407,9 +3407,9 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v1, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
 ; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v8, v12, v[0:1]
-; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v6
+; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v5
 ; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v10, v7, v[3:4]
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v6, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v9, v2
 ; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v11, v3, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v11, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index a922e0b46dfed..f6dc2127ec9e8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -1286,17 +1286,17 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    s_add_u32 s6, s12, s16
 ; GFX8-NEXT:    s_mov_b32 s17, s16
 ; GFX8-NEXT:    s_addc_u32 s7, s13, s16
-; GFX8-NEXT:    s_xor_b64 s[6:7], s[6:7], s[16:17]
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX8-NEXT:    s_xor_b64 s[8:9], s[6:7], s[16:17]
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s8
 ; GFX8-NEXT:    s_mov_b32 s3, s2
-; GFX8-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
+; GFX8-NEXT:    s_xor_b64 s[12:13], s[0:1], s[2:3]
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_sub_u32 s12, 0, s6
-; GFX8-NEXT:    s_subb_u32 s13, 0, s7
-; GFX8-NEXT:    s_xor_b64 s[16:17], s[2:3], s[16:17]
+; GFX8-NEXT:    s_sub_u32 s6, 0, s8
+; GFX8-NEXT:    s_subb_u32 s7, 0, s9
+; GFX8-NEXT:    s_xor_b64 s[18:19], s[2:3], s[16:17]
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v2, v1
@@ -1304,10 +1304,12 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s12, v3, 0
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s12, v4, v[1:2]
+; GFX8-NEXT:    s_ashr_i32 s16, s15, 31
+; GFX8-NEXT:    s_mov_b32 s17, s16
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2]
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s13, v3, v[1:2]
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2]
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v4, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v6, v3, v1
@@ -1330,15 +1332,14 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s12, v3, 0
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s12, v4, v[1:2]
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2]
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT:    s_ashr_i32 s12, s15, 31
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s13, v3, v[1:2]
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2]
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v4, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT:    s_mov_b32 s13, s12
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
@@ -1358,64 +1359,64 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX8-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX8-NEXT:    v_mul_hi_u32 v4, s8, v0
-; GFX8-NEXT:    v_mul_hi_u32 v0, s9, v0
-; GFX8-NEXT:    v_mul_hi_u32 v5, s9, v1
+; GFX8-NEXT:    v_mul_lo_u32 v2, s13, v0
+; GFX8-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX8-NEXT:    v_mul_hi_u32 v4, s12, v0
+; GFX8-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GFX8-NEXT:    v_mul_hi_u32 v5, s13, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v4, s9, v1
+; GFX8-NEXT:    v_mul_lo_u32 v4, s13, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT:    v_mul_hi_u32 v3, s8, v1
+; GFX8-NEXT:    v_mul_hi_u32 v3, s12, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v4, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
-; GFX8-NEXT:    v_mov_b32_e32 v6, s9
-; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s8, v0
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v4, v[1:2]
-; GFX8-NEXT:    v_mov_b32_e32 v5, s7
-; GFX8-NEXT:    s_ashr_i32 s8, s11, 31
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
+; GFX8-NEXT:    v_mov_b32_e32 v6, s13
+; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s12, v0
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s9
+; GFX8-NEXT:    s_ashr_i32 s12, s11, 31
 ; GFX8-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v0, s[0:1], s9, v1
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v6
+; GFX8-NEXT:    v_sub_u32_e64 v0, s[0:1], s13, v1
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v7
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
 ; GFX8-NEXT:    v_subb_u32_e32 v0, vcc, v0, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v6
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, s6, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v6
+; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, s8, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[0:1]
 ; GFX8-NEXT:    v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], 1, v4
 ; GFX8-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v9
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v8
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v9
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v1
 ; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
-; GFX8-NEXT:    s_add_u32 s0, s10, s8
-; GFX8-NEXT:    s_addc_u32 s1, s11, s8
-; GFX8-NEXT:    s_add_u32 s10, s14, s12
-; GFX8-NEXT:    s_addc_u32 s11, s15, s12
-; GFX8-NEXT:    s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX8-NEXT:    s_add_u32 s0, s10, s12
+; GFX8-NEXT:    s_addc_u32 s1, s11, s12
+; GFX8-NEXT:    s_add_u32 s10, s14, s16
+; GFX8-NEXT:    s_addc_u32 s11, s15, s16
+; GFX8-NEXT:    s_xor_b64 s[10:11], s[10:11], s[16:17]
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v14, s11
 ; GFX8-NEXT:    v_subb_u32_e32 v0, vcc, v0, v5, vcc
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s10
-; GFX8-NEXT:    v_subrev_u32_e32 v15, vcc, s6, v8
+; GFX8-NEXT:    v_subrev_u32_e32 v15, vcc, s8, v8
 ; GFX8-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v0, vcc
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v14
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v5
@@ -1429,19 +1430,19 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v12
 ; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v13, v0
-; GFX8-NEXT:    s_mov_b32 s9, s8
-; GFX8-NEXT:    s_xor_b64 s[6:7], s[0:1], s[8:9]
+; GFX8-NEXT:    s_mov_b32 s13, s12
+; GFX8-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
 ; GFX8-NEXT:    s_sub_u32 s3, 0, s10
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s3, v13, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v12
-; GFX8-NEXT:    s_subb_u32 s18, 0, s11
+; GFX8-NEXT:    s_subb_u32 s20, 0, s11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v3, v10, vcc
 ; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v5, v[1:2]
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v8, v15, s[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], s18, v13, v[1:2]
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], s20, v13, v[1:2]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v9, v16, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v5, v0
@@ -1469,13 +1470,13 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v13, v0
 ; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s3, v8, 0
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; GFX8-NEXT:    v_xor_b32_e32 v1, s16, v4
+; GFX8-NEXT:    v_xor_b32_e32 v1, s18, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s3, v5, v[0:1]
-; GFX8-NEXT:    v_xor_b32_e32 v9, s17, v10
-; GFX8-NEXT:    v_mov_b32_e32 v10, s17
-; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s18, v8, v[3:4]
-; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s16, v1
+; GFX8-NEXT:    v_xor_b32_e32 v9, s19, v10
+; GFX8-NEXT:    v_mov_b32_e32 v10, s19
+; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4]
+; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s18, v1
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v9, v10, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v4, s2, v7
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v5, v2
@@ -1503,37 +1504,37 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v10, s2
-; GFX8-NEXT:    v_mul_lo_u32 v7, s7, v2
-; GFX8-NEXT:    v_mul_lo_u32 v8, s6, v3
+; GFX8-NEXT:    v_mul_lo_u32 v7, s9, v2
+; GFX8-NEXT:    v_mul_lo_u32 v8, s8, v3
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s2, v4
 ; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v6, v10, vcc
-; GFX8-NEXT:    v_mul_hi_u32 v6, s6, v2
+; GFX8-NEXT:    v_mul_hi_u32 v6, s8, v2
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v7, s7, v3
-; GFX8-NEXT:    v_mul_hi_u32 v2, s7, v2
+; GFX8-NEXT:    v_mul_lo_u32 v7, s9, v3
+; GFX8-NEXT:    v_mul_hi_u32 v2, s9, v2
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
-; GFX8-NEXT:    v_mul_hi_u32 v8, s6, v3
+; GFX8-NEXT:    v_mul_hi_u32 v8, s8, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v7, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v2, v6
-; GFX8-NEXT:    v_mul_hi_u32 v9, s7, v3
+; GFX8-NEXT:    v_mul_hi_u32 v9, s9, v3
 ; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s10, v8, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
 ; GFX8-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s10, v9, v[3:4]
-; GFX8-NEXT:    v_mov_b32_e32 v10, s7
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s6, v2
+; GFX8-NEXT:    v_mov_b32_e32 v10, s9
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s8, v2
 ; GFX8-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s11, v8, v[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX8-NEXT:    v_subb_u32_e64 v7, s[0:1], v10, v6, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s7, v6
+; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s9, v6
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
@@ -1543,40 +1544,39 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[0:1]
 ; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, s10, v2
 ; GFX8-NEXT:    v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc
+; GFX8-NEXT:    v_add_u32_e64 v13, s[0:1], 1, v8
+; GFX8-NEXT:    v_addc_u32_e64 v14, s[0:1], 0, v9, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v12
-; GFX8-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v11
-; GFX8-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v12
-; GFX8-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v14, s[0:1], 1, v8
 ; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v6, v3, vcc
-; GFX8-NEXT:    v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v14
-; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v15, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX8-NEXT:    v_subrev_u32_e64 v13, s[0:1], s10, v11
-; GFX8-NEXT:    v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[0:1]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v12
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s10, v11
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v15, v16, s[0:1]
+; GFX8-NEXT:    v_add_u32_e64 v16, s[0:1], 1, v13
+; GFX8-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1]
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v13, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v10
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, v9, v14, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v13, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, v2, v9, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v14, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v2, v6, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[0:1]
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[8:9], s[12:13]
-; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v6
-; GFX8-NEXT:    v_xor_b32_e32 v3, s1, v8
-; GFX8-NEXT:    v_mov_b32_e32 v6, s1
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[12:13], s[16:17]
+; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v8
+; GFX8-NEXT:    v_xor_b32_e32 v3, s1, v9
+; GFX8-NEXT:    v_mov_b32_e32 v8, s1
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
-; GFX8-NEXT:    v_xor_b32_e32 v6, s8, v9
-; GFX8-NEXT:    v_xor_b32_e32 v7, s8, v7
-; GFX8-NEXT:    v_mov_b32_e32 v8, s8
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s8, v6
+; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v6, s12, v6
+; GFX8-NEXT:    v_xor_b32_e32 v7, s12, v7
+; GFX8-NEXT:    v_mov_b32_e32 v8, s12
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s12, v6
 ; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v7, v8, vcc
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s5
@@ -1599,17 +1599,17 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    s_add_u32 s6, s12, s16
 ; GFX9-NEXT:    s_mov_b32 s17, s16
 ; GFX9-NEXT:    s_addc_u32 s7, s13, s16
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[16:17]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT:    s_xor_b64 s[8:9], s[6:7], s[16:17]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
 ; GFX9-NEXT:    s_mov_b32 s3, s2
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
+; GFX9-NEXT:    s_xor_b64 s[12:13], s[0:1], s[2:3]
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_u32 s12, 0, s6
-; GFX9-NEXT:    s_subb_u32 s13, 0, s7
-; GFX9-NEXT:    s_xor_b64 s[16:17], s[2:3], s[16:17]
+; GFX9-NEXT:    s_sub_u32 s6, 0, s8
+; GFX9-NEXT:    s_subb_u32 s7, 0, s9
+; GFX9-NEXT:    s_xor_b64 s[18:19], s[2:3], s[16:17]
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
@@ -1617,10 +1617,12 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s12, v3, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s12, v4, v[1:2]
+; GFX9-NEXT:    s_ashr_i32 s16, s15, 31
+; GFX9-NEXT:    s_mov_b32 s17, s16
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s13, v3, v[1:2]
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, v4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v3, v1
@@ -1642,15 +1644,15 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s12, v3, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s12, v4, v[1:2]
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, s9
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT:    s_ashr_i32 s12, s15, 31
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s13, v3, v[1:2]
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, v4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT:    s_mov_b32 s13, s12
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
@@ -1669,226 +1671,225 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT:    v_mul_hi_u32 v6, s9, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GFX9-NEXT:    v_mul_hi_u32 v6, s13, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v1
+; GFX9-NEXT:    v_mul_lo_u32 v4, s13, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s12, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s8, v5, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v6, s9
-; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, s8, v0
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v4, s7
-; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
-; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v0, s9, v1
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v7
-; GFX9-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-NEXT:    v_add3_u32 v4, v3, v0, v6
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, s13
+; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, s12, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s9, v5, v[2:3]
+; GFX9-NEXT:    s_ashr_i32 s12, s11, 31
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v6, v2, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v1, s13, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v6
-; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s6, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[0:1]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], 1, v5
-; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v8
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v6
+; GFX9-NEXT:    v_subrev_co_u32_e32 v9, vcc, s8, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[0:1]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v10, s[0:1], 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], 1, v5
+; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v12, s[0:1], 1, v1
-; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s10, s8
-; GFX9-NEXT:    s_addc_u32 s1, s11, s8
-; GFX9-NEXT:    s_add_u32 s10, s14, s12
-; GFX9-NEXT:    s_addc_u32 s11, s15, s12
-; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[12:13]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v14, s11
-; GFX9-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s10
-; GFX9-NEXT:    v_subrev_co_u32_e32 v15, vcc, s6, v8
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v16, vcc, 0, v0, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v14
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v12, v1
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v12
-; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v13, v0
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[8:9]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v2
+; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, s10, s12
+; GFX9-NEXT:    s_addc_u32 s1, s11, s12
+; GFX9-NEXT:    s_add_u32 s10, s14, s16
+; GFX9-NEXT:    s_addc_u32 s11, s15, s16
+; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[16:17]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v15, s11
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s10
+; GFX9-NEXT:    v_subrev_co_u32_e32 v16, vcc, s8, v9
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v17, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v15
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v2, v13, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v13, v2
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v13
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v14, v1
+; GFX9-NEXT:    s_mov_b32 s13, s12
+; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
 ; GFX9-NEXT:    s_sub_u32 s3, 0, s10
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s3, v13, 0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v14, 0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v13, v13
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    s_subb_u32 s14, 0, s11
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v12, v[1:2]
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v10, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v10, v13, v0
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s14, v13, v[1:2]
-; GFX9-NEXT:    v_mul_lo_u32 v2, v12, v0
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s3, v13, v[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v4, v11, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v11, v14, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s14, v14, v[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v13, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v15, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v12
+; GFX9-NEXT:    v_mul_lo_u32 v4, v14, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v16, s[0:1]
-; GFX9-NEXT:    v_mul_hi_u32 v0, v12, v0
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v2, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[0:1]
+; GFX9-NEXT:    v_mul_hi_u32 v1, v13, v1
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], v3, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], v3, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v2, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v10, v12, v1
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v13, v1
-; GFX9-NEXT:    v_mul_hi_u32 v1, v12, v1
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[0:1], v10, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
+; GFX9-NEXT:    v_mul_lo_u32 v11, v13, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, v14, v2
+; GFX9-NEXT:    v_mul_hi_u32 v2, v13, v2
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v11, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v3
+; GFX9-NEXT:    v_add_u32_e32 v4, v11, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, v10, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add3_u32 v1, v3, v2, v1
-; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], v13, v0
-; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], v12, v1, s[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s3, v10, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v8, s16, v4
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s3, v11, v[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v5, s17, v5
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s14, v10, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v9, s17
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s16, v8
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v9, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v4, s2, v7
-; GFX9-NEXT:    v_mul_lo_u32 v5, v11, v2
-; GFX9-NEXT:    v_mul_lo_u32 v7, v10, v3
-; GFX9-NEXT:    v_mul_hi_u32 v8, v10, v2
-; GFX9-NEXT:    v_mul_hi_u32 v2, v11, v2
+; GFX9-NEXT:    v_add3_u32 v2, v4, v3, v2
+; GFX9-NEXT:    v_add_co_u32_e64 v11, s[0:1], v14, v1
+; GFX9-NEXT:    v_addc_co_u32_e64 v12, s[0:1], v13, v2, s[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s3, v11, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v9, s18, v5
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v12, v[1:2]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v7, s19, v7
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s14, v11, v[1:2]
+; GFX9-NEXT:    v_mov_b32_e32 v10, s19
+; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s18, v9
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v10, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v5, s2, v8
+; GFX9-NEXT:    v_mul_lo_u32 v7, v12, v3
+; GFX9-NEXT:    v_mul_lo_u32 v8, v11, v4
+; GFX9-NEXT:    v_mul_hi_u32 v9, v11, v3
+; GFX9-NEXT:    v_mul_hi_u32 v3, v12, v3
 ; GFX9-NEXT:    v_xor_b32_e32 v6, s2, v6
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v8, v11, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT:    v_mul_hi_u32 v7, v10, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, v11, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v9, v12, v4
 ; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_add3_u32 v3, v7, v5, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v10, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v2
-; GFX9-NEXT:    v_mul_lo_u32 v7, s6, v3
-; GFX9-NEXT:    v_mul_hi_u32 v9, s6, v2
-; GFX9-NEXT:    v_mul_hi_u32 v2, s7, v2
-; GFX9-NEXT:    v_mul_hi_u32 v12, s7, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v3
-; GFX9-NEXT:    v_mov_b32_e32 v8, s2
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v9, v2
+; GFX9-NEXT:    v_mul_hi_u32 v8, v11, v4
+; GFX9-NEXT:    v_mul_hi_u32 v4, v12, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v9, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v2, v5
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s10, v10, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s2, v4
-; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v8, vcc
-; GFX9-NEXT:    v_add_u32_e32 v6, v9, v7
-; GFX9-NEXT:    v_add3_u32 v8, v6, v11, v12
-; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s10, v8, v[3:4]
-; GFX9-NEXT:    v_mov_b32_e32 v9, s7
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s6, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s11, v10, v[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v7
-; GFX9-NEXT:    v_sub_u32_e32 v6, s7, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v7
-; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s10, v2
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v14, s[0:1], 1, v10
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v6, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 1, v14
-; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, 0, v15, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v15, s[0:1], s10, v11
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v11, v15, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v14, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v2, v9, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[0:1]
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[8:9], s[12:13]
-; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v6
-; GFX9-NEXT:    v_xor_b32_e32 v3, s1, v8
-; GFX9-NEXT:    v_mov_b32_e32 v6, s1
-; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v6, s8, v9
-; GFX9-NEXT:    v_mov_b32_e32 v13, 0
-; GFX9-NEXT:    v_xor_b32_e32 v7, s8, v7
-; GFX9-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s8, v6
-; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v8, vcc
+; GFX9-NEXT:    v_add3_u32 v4, v8, v7, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v11, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v12, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, s9, v3
+; GFX9-NEXT:    v_mul_lo_u32 v8, s8, v4
+; GFX9-NEXT:    v_mul_hi_u32 v10, s8, v3
+; GFX9-NEXT:    v_mul_hi_u32 v3, s9, v3
+; GFX9-NEXT:    v_mul_hi_u32 v12, s9, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v10, s9, v4
+; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
+; GFX9-NEXT:    v_mul_hi_u32 v8, s8, v4
+; GFX9-NEXT:    v_mov_b32_e32 v9, s2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v10, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v3, v7
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s10, v11, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s2, v5
+; GFX9-NEXT:    v_add_u32_e32 v8, v10, v8
+; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v9, vcc
+; GFX9-NEXT:    v_add3_u32 v9, v8, v7, v12
+; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[0:1], s10, v9, v[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v10, s9
+; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s8, v3
+; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[0:1], s11, v11, v[7:8]
+; GFX9-NEXT:    v_mov_b32_e32 v4, s11
+; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[0:1], v10, v7, vcc
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v8
+; GFX9-NEXT:    v_sub_u32_e32 v7, s9, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v8
+; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v12, s[0:1]
+; GFX9-NEXT:    v_subrev_co_u32_e32 v12, vcc, s10, v3
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v13, s[0:1], 0, v7, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v14, s[0:1], 1, v11
+; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v13
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v12
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v7, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v13
+; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s10, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v17, s[0:1], 1, v14
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v18, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v12, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v11, v14, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v3, v7, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v4, s[0:1]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[12:13], s[16:17]
+; GFX9-NEXT:    v_xor_b32_e32 v3, s0, v10
+; GFX9-NEXT:    v_xor_b32_e32 v4, s1, v9
+; GFX9-NEXT:    v_mov_b32_e32 v9, s1
+; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v9, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v7, s12, v7
+; GFX9-NEXT:    v_xor_b32_e32 v8, s12, v8
+; GFX9-NEXT:    v_mov_b32_e32 v9, s12
+; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s12, v7
+; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, v8, v9, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v13, v[0:3], s[4:5]
-; GFX9-NEXT:    global_store_dwordx4 v13, v[4:7], s[6:7]
+; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v0, v[5:8], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sdivrem_v2i64:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index a40fe242e758c..a9abdc86dcb94 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -448,6 +448,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v11, v1
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v11, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT:    v_mul_hi_u32 v13, v14, v1
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
@@ -460,136 +461,134 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT:    v_mul_hi_u32 v1, v14, v1
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v1, v0
-; GISEL-NEXT:    v_ashrrev_i32_e32 v12, 31, v7
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v12, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v13, v6, v12
-; GISEL-NEXT:    v_xor_b32_e32 v12, v7, v12
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v15, v13
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v16, v12
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0
-; GISEL-NEXT:    v_mac_f32_e32 v15, 0x4f800000, v16
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v10, v[1:2]
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v15
-; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, 0, v13
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v8, v9, v[6:7]
-; GISEL-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v1
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
-; GISEL-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v15, v1
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_subb_u32_e32 v17, vcc, 0, v12, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v16, v15, 0
-; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v11, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, v10
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v7, v[0:1]
-; GISEL-NEXT:    v_subb_u32_e64 v10, s[4:5], v14, v6, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1]
-; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v14, v6
-; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v9
-; GISEL-NEXT:    v_mul_lo_u32 v14, v15, v0
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v10, v8
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v14
-; GISEL-NEXT:    v_mul_hi_u32 v14, v15, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[6:7]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v11, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[6:7]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], v10, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[6:7]
-; GISEL-NEXT:    v_sub_i32_e32 v14, vcc, v11, v5
-; GISEL-NEXT:    v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v14, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v19, v8
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, -1, s[8:9]
-; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, -1, s[6:7]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], v19, v8
-; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v14, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v20, v20, v21, s[6:7]
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v19, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v14, v7, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v18, v1
-; GISEL-NEXT:    v_mul_hi_u32 v18, v15, v0
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v18
-; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v0, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v12, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v15, v1
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v0, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v9, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v16, v7, v[1:2]
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GISEL-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v17, v9, v[5:6]
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT:    v_xor_b32_e32 v1, v11, v4
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v11, v2, v10
-; GISEL-NEXT:    v_mul_lo_u32 v2, v7, v0
-; GISEL-NEXT:    v_mul_lo_u32 v6, v9, v5
-; GISEL-NEXT:    v_xor_b32_e32 v14, v3, v10
-; GISEL-NEXT:    v_mul_hi_u32 v3, v9, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2]
+; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v8, v12, v[9:10]
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v11, v0
+; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v14, v9, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v14, v9
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v8
+; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, v0, v8, vcc
+; GISEL-NEXT:    v_ashrrev_i32_e32 v0, 31, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, v1, v9, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v0
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v7, v0, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v13, v1, v0
+; GISEL-NEXT:    v_xor_b32_e32 v14, v6, v0
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, v13
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, v14
+; GISEL-NEXT:    v_sub_i32_e32 v15, vcc, v10, v5
+; GISEL-NEXT:    v_subbrev_u32_e64 v16, s[4:5], 0, v12, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v16, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v15, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v16, v8
+; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, v6, v1, s[4:5]
+; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v1
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v6
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v18, v0
+; GISEL-NEXT:    v_sub_i32_e64 v19, s[4:5], 0, v13
+; GISEL-NEXT:    v_subb_u32_e64 v20, s[4:5], 0, v14, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v19, v18, 0
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v21, v6
+; GISEL-NEXT:    v_subb_u32_e32 v8, vcc, v12, v8, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v12, v18, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v19, v21, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v15, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v20, v18, v[6:7]
+; GISEL-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v17
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v15, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v21, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v18, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v21, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v21, v5
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v7, v1
+; GISEL-NEXT:    v_mul_hi_u32 v7, v18, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v12, v7
+; GISEL-NEXT:    v_mul_hi_u32 v5, v21, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v7, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v18, v0
+; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v21, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v19, v7, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v10, v6, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v19, v12, v[1:2]
+; GISEL-NEXT:    v_xor_b32_e32 v1, v9, v4
+; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v20, v7, v[5:6]
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v10, v2, v9
+; GISEL-NEXT:    v_mul_lo_u32 v2, v12, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v5
+; GISEL-NEXT:    v_xor_b32_e32 v11, v3, v9
+; GISEL-NEXT:    v_mul_hi_u32 v3, v7, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v7, v5
+; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v5
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT:    v_mul_hi_u32 v6, v9, v5
+; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v5, v7, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v12, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v14, v0
-; GISEL-NEXT:    v_mul_lo_u32 v5, v11, v2
-; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v12, v2, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v3, v11, v0
+; GISEL-NEXT:    v_mul_lo_u32 v5, v10, v2
+; GISEL-NEXT:    v_mul_hi_u32 v6, v10, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
 ; GISEL-NEXT:    v_xor_b32_e32 v7, v8, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v14, v2
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v2
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT:    v_mul_hi_u32 v5, v11, v2
+; GISEL-NEXT:    v_mul_hi_u32 v5, v10, v2
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v0, v3
-; GISEL-NEXT:    v_mul_hi_u32 v6, v14, v2
+; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v2
 ; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v13, v8, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
@@ -598,38 +597,38 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v5, v[0:1]
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v7, v4, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v8, v[5:6]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v11, v2
-; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v14, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v14, v3
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v12
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v14, v8, v[5:6]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v10, v2
+; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v11, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v11, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v12
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v14
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v14, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[4:5]
 ; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v2, v13
 ; GISEL-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v12
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v12
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v12, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, v6, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v14
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v14, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v6, v13
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v10
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v10
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64:
@@ -2531,185 +2530,183 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v0
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v14, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v13, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, v12, v9
-; GISEL-NEXT:    v_lshl_b64 v[0:1], s[4:5], v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v12, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v13, v9
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
-; GISEL-NEXT:    v_mul_hi_u32 v11, v12, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v14, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v9, v12, v1
+; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v9, v12, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_mul_hi_u32 v9, v13, v9
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v0, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v13, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v14, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v9, v8
-; GISEL-NEXT:    v_ashrrev_i32_e32 v11, 31, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v11, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v5, v6, 0
-; GISEL-NEXT:    v_xor_b32_e32 v14, v0, v11
-; GISEL-NEXT:    v_xor_b32_e32 v15, v1, v11
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v14
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v16, v15
-; GISEL-NEXT:    v_mov_b32_e32 v0, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v10, v[0:1]
-; GISEL-NEXT:    v_mac_f32_e32 v11, 0x4f800000, v16
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v9, v11
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v7, v6, v[0:1]
-; GISEL-NEXT:    v_sub_i32_e32 v17, vcc, 0, v14
-; GISEL-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v9
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v1
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
-; GISEL-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v16, v1
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_subb_u32_e32 v18, vcc, 0, v15, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v17, v16, 0
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT:    v_mov_b32_e32 v1, v10
-; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v17, v6, v[1:2]
-; GISEL-NEXT:    v_mul_lo_u32 v1, v6, v9
-; GISEL-NEXT:    v_subb_u32_e64 v12, s[4:5], v13, v0, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v18, v16, v[10:11]
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v11, v16, v10
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v7
-; GISEL-NEXT:    v_subb_u32_e32 v0, vcc, v0, v7, vcc
-; GISEL-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v16, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[6:7]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v8, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[6:7]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], v12, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, v1, v11, s[6:7]
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v8, v5
-; GISEL-NEXT:    v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v1, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v19, v7
-; GISEL-NEXT:    v_subb_u32_e32 v0, vcc, v0, v7, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, -1, s[8:9]
-; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, -1, s[6:7]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], v19, v7
-; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v1, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v20, v20, v21, s[6:7]
-; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v19, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_hi_u32 v1, v6, v9
-; GISEL-NEXT:    v_mul_lo_u32 v9, v6, v10
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT:    v_mul_hi_u32 v13, v16, v10
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT:    v_mul_hi_u32 v10, v6, v10
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[6:7], v5, v8, v[1:2]
+; GISEL-NEXT:    v_lshl_b64 v[10:11], s[4:5], v6
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v7, v14, v[8:9]
+; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, v12, v0
+; GISEL-NEXT:    v_subb_u32_e64 v12, s[4:5], v13, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v13, v8
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, v1, v6, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v6, vcc, v0, v7, vcc
+; GISEL-NEXT:    v_ashrrev_i32_e32 v0, 31, v11
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v0
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v11, v0, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v11, v1, v0
+; GISEL-NEXT:    v_xor_b32_e32 v10, v10, v0
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, v11
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, v10
+; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v9, v5
+; GISEL-NEXT:    v_subbrev_u32_e64 v14, s[4:5], 0, v6, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v14, v7
+; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, v15, v1, s[4:5]
+; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GISEL-NEXT:    v_trunc_f32_e32 v16, v1
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v16
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v17, v0
+; GISEL-NEXT:    v_sub_i32_e64 v18, s[4:5], 0, v11
+; GISEL-NEXT:    v_subb_u32_e64 v19, s[4:5], 0, v10, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v16, v16
+; GISEL-NEXT:    v_subb_u32_e32 v20, vcc, v6, v7, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v18, v16, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v13, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v19, v17, v[6:7]
+; GISEL-NEXT:    v_subbrev_u32_e32 v20, vcc, 0, v20, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v13, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v16, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v17, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v14, v20, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v14, v17, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v16, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v16, v0
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v6, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v9, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v17, v10, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v14, v16, v5
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v7, v1
+; GISEL-NEXT:    v_mul_hi_u32 v7, v17, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT:    v_mul_hi_u32 v5, v16, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v7, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v17, v0
+; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, v16, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v18, v7, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v9, v6, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v18, v14, v[1:2]
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v8, v4
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v18, v9, v[5:6]
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v12, v7, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v19, v7, v[5:6]
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v12, v13, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v11, v2, v8
-; GISEL-NEXT:    v_mul_lo_u32 v2, v10, v0
-; GISEL-NEXT:    v_mul_lo_u32 v6, v9, v5
-; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v8
-; GISEL-NEXT:    v_mul_hi_u32 v3, v9, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v0
+; GISEL-NEXT:    v_xor_b32_e32 v12, v2, v8
+; GISEL-NEXT:    v_mul_lo_u32 v2, v14, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v5
+; GISEL-NEXT:    v_xor_b32_e32 v13, v3, v8
+; GISEL-NEXT:    v_mul_hi_u32 v3, v7, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v14, v0
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v10, v5
+; GISEL-NEXT:    v_mul_lo_u32 v3, v14, v5
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT:    v_mul_hi_u32 v6, v9, v5
+; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v5, v10, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v14, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v10, v2, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v0
-; GISEL-NEXT:    v_mul_lo_u32 v5, v11, v2
-; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT:    v_xor_b32_e32 v7, v7, v4
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v14, v2, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v2
+; GISEL-NEXT:    v_mul_hi_u32 v6, v12, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT:    v_xor_b32_e32 v7, v9, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v12, v2
+; GISEL-NEXT:    v_mul_lo_u32 v6, v13, v2
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT:    v_mul_hi_u32 v5, v11, v2
+; GISEL-NEXT:    v_mul_hi_u32 v5, v12, v2
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v0, v3
-; GISEL-NEXT:    v_mul_hi_u32 v6, v12, v2
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v14, v9, 0
+; GISEL-NEXT:    v_mul_hi_u32 v6, v13, v2
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v11, v9, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v14, v5, v[0:1]
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v11, v5, v[0:1]
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v7, v4, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v15, v9, v[5:6]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v11, v2
-; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v12, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v12, v3
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v15
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v10, v9, v[5:6]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v12, v2
+; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v13, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v13, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v14
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v15
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v15, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v10
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v2, v14
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v2, v11
 ; GISEL-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v15
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v15
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v15, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v6, v14
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v11
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v10
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v6, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, v9, v12, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
 ; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
@@ -3198,131 +3195,131 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v12, v4, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v11, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v7, v10, v4
-; GISEL-NEXT:    v_and_b32_e32 v8, 0xffffff, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v10, v0
+; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
+; GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff, v2
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v4
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v4
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v0, v5
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v0, v5
+; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v7, 0
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v0
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 0, v8
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v5
-; GISEL-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1]
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, v8
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v12, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6]
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v1, v7, v[0:1]
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v6
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[7:8]
 ; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v10, v4
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v12
+; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v11, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v11, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], 0, v0
+; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, v9
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v10
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v4, v5, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v3, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v6
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT:    v_subb_u32_e64 v10, s[4:5], v11, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v11, s[4:5], v11, v5
+; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v7, v1
+; GISEL-NEXT:    v_subbrev_u32_e64 v14, s[4:5], 0, v2, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v0
 ; GISEL-NEXT:    v_trunc_f32_e32 v6, v4
 ; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v0
-; GISEL-NEXT:    v_sub_i32_e64 v13, s[4:5], 0, v8
-; GISEL-NEXT:    v_subb_u32_e64 v14, s[4:5], 0, v9, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v15, v6
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v3
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v15, v0
+; GISEL-NEXT:    v_sub_i32_e64 v16, s[4:5], 0, v9
+; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, v10, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v16, v15, 0
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v18, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v3
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v15, v[0:1]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, -1, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v16, v18, v[0:1]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v14, v12, v[5:6]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, v16, v0, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v0, v15, v4
-; GISEL-NEXT:    v_mul_lo_u32 v16, v12, v5
-; GISEL-NEXT:    v_mul_hi_u32 v17, v12, v4
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, v11, v3, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v17
-; GISEL-NEXT:    v_sub_i32_e64 v17, s[4:5], v7, v1
-; GISEL-NEXT:    v_subbrev_u32_e64 v18, s[6:7], 0, v11, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v17, v1
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v18, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[8:9]
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, -1, s[6:7]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], v18, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, v0, v19, s[6:7]
-; GISEL-NEXT:    v_subb_u32_e64 v0, s[4:5], v11, v3, s[4:5]
-; GISEL-NEXT:    v_mul_hi_u32 v4, v15, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, v15, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v17, v15, v[5:6]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v14, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v19, v0, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v0, v18, v4
+; GISEL-NEXT:    v_mul_lo_u32 v19, v15, v5
+; GISEL-NEXT:    v_subb_u32_e32 v20, vcc, v2, v3, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v2, v15, v4
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v19
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v16, v3
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT:    v_mul_hi_u32 v11, v12, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v16, v11
-; GISEL-NEXT:    v_and_b32_e32 v16, 0xffffff, v2
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v3
-; GISEL-NEXT:    v_mul_hi_u32 v4, v15, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v2, v18, v5
+; GISEL-NEXT:    v_mul_hi_u32 v4, v18, v4
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; GISEL-NEXT:    v_mul_hi_u32 v3, v15, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v11, v3
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v12, v2
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v15, v3, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v13, v4, 0
-; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v17, v1
-; GISEL-NEXT:    v_subbrev_u32_e32 v12, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v4, v18, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v15, v0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v18, v2, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v16, v4, 0
+; GISEL-NEXT:    v_sub_i32_e32 v15, vcc, v13, v1
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v13, v5, v[0:1]
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v17, v11, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v14, v4, v[0:1]
-; GISEL-NEXT:    v_cndmask_b32_e32 v11, v18, v12, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v5, v[0:1]
+; GISEL-NEXT:    v_subbrev_u32_e32 v18, vcc, 0, v20, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v4, v[0:1]
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v13, v15, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v14, v18, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v5, v2
-; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v4, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v4, v2
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], 0, v16
+; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], 0, v12
 ; GISEL-NEXT:    v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v2, v5, v2
-; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], v6, v3
-; GISEL-NEXT:    v_mul_hi_u32 v6, v4, v0
+; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], v7, v3
+; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v0
 ; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v13, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v13, v6
+; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v13, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v5, v0
 ; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], v6, v3
+; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], v7, v3
 ; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v0, v3
 ; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v4, v2
 ; GISEL-NEXT:    v_addc_u32_e64 v0, s[4:5], v5, v0, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v2
-; GISEL-NEXT:    v_mul_lo_u32 v4, v7, v0
-; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v4, v11, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v8, v6, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v2
 ; GISEL-NEXT:    v_mul_hi_u32 v2, v12, v2
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
@@ -3330,7 +3327,7 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v6, v12, v0
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT:    v_mul_hi_u32 v4, v7, v0
+; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v0
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -3338,38 +3335,38 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v2, v3
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v8, v6, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v0, v4
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[0:1]
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v9, v4, v[0:1]
 ; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v9, v6, v[3:4]
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v11, v2
 ; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v12, v3, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v12, v3
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v9
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v8
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v9
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v10
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v2, v8
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v2, v9
 ; GISEL-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v8
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v9
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v9
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v6, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v10
+; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, v6, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 68adc75df295c..eebe3c41ed50f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -1993,9 +1993,8 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_max_i32_e32 v17, v17, v20
-; GFX6-NEXT:    buffer_load_dword v20, off, s[0:3], s32
 ; GFX6-NEXT:    v_min_i32_e32 v19, -1, v4
+; GFX6-NEXT:    v_max_i32_e32 v17, v17, v20
 ; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v17
@@ -2012,69 +2011,70 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v22
 ; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
 ; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    buffer_load_dword v19, off, s[0:3], s32
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v19, -1, v7
+; GFX6-NEXT:    v_min_i32_e32 v20, -1, v7
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v23
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v19, -1, v8
+; GFX6-NEXT:    v_min_i32_e32 v20, -1, v8
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v24
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v9
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v19, -1, v9
+; GFX6-NEXT:    v_min_i32_e32 v20, -1, v9
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v25
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v19, -1, v10
+; GFX6-NEXT:    v_min_i32_e32 v20, -1, v10
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v26
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v11
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v19, -1, v11
+; GFX6-NEXT:    v_min_i32_e32 v20, -1, v11
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v27
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, v11, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v12
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v19, -1, v12
+; GFX6-NEXT:    v_min_i32_e32 v20, -1, v12
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v28
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v13
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v19, -1, v13
+; GFX6-NEXT:    v_min_i32_e32 v20, -1, v13
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v29
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX6-NEXT:    v_sub_i32_e32 v13, vcc, v13, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v14
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v19, -1, v14
+; GFX6-NEXT:    v_min_i32_e32 v20, -1, v14
+; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v30
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v15
 ; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v17, v16
 ; GFX6-NEXT:    v_min_i32_e32 v17, -1, v15
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v18
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_max_i32_e32 v16, v16, v20
+; GFX6-NEXT:    v_max_i32_e32 v16, v16, v19
 ; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
 ; GFX6-NEXT:    v_sub_i32_e32 v15, vcc, v15, v16
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -2116,9 +2116,8 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_max_i32_e32 v17, v17, v20
-; GFX8-NEXT:    buffer_load_dword v20, off, s[0:3], s32
 ; GFX8-NEXT:    v_min_i32_e32 v19, -1, v4
+; GFX8-NEXT:    v_max_i32_e32 v17, v17, v20
 ; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v17
@@ -2135,69 +2134,70 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v22
 ; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
 ; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    buffer_load_dword v19, off, s[0:3], s32
 ; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v6, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v7
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v19, -1, v7
+; GFX8-NEXT:    v_min_i32_e32 v20, -1, v7
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v23
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, v7, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v19, -1, v8
+; GFX8-NEXT:    v_min_i32_e32 v20, -1, v8
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v24
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, v8, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v9
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v19, -1, v9
+; GFX8-NEXT:    v_min_i32_e32 v20, -1, v9
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v25
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX8-NEXT:    v_sub_u32_e32 v9, vcc, v9, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v10
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v19, -1, v10
+; GFX8-NEXT:    v_min_i32_e32 v20, -1, v10
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v26
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, v10, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v11
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v19, -1, v11
+; GFX8-NEXT:    v_min_i32_e32 v20, -1, v11
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v27
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, v11, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v12
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v19, -1, v12
+; GFX8-NEXT:    v_min_i32_e32 v20, -1, v12
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v28
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, v12, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v13
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v19, -1, v13
+; GFX8-NEXT:    v_min_i32_e32 v20, -1, v13
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v29
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX8-NEXT:    v_sub_u32_e32 v13, vcc, v13, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v14
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v19, -1, v14
+; GFX8-NEXT:    v_min_i32_e32 v20, -1, v14
+; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v30
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
 ; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, v14, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v15
 ; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, v17, v16
 ; GFX8-NEXT:    v_min_i32_e32 v17, -1, v15
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v18
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_i32_e32 v16, v16, v20
+; GFX8-NEXT:    v_max_i32_e32 v16, v16, v19
 ; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
 ; GFX8-NEXT:    v_sub_u32_e32 v15, vcc, v15, v16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 93e8a43ba773b..9cb924ba233e8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -365,256 +365,256 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_udiv_v2i64:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v5
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
-; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v8
-; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0xcf800000, v9
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v10, v5
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v12, v7
+; GISEL-NEXT:    v_sub_i32_e64 v13, s[4:5], 0, v6
+; GISEL-NEXT:    v_subb_u32_e32 v14, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v15, vcc, 0, v7, s[4:5]
+; GISEL-NEXT:    v_mac_f32_e32 v9, 0x4f800000, v10
+; GISEL-NEXT:    v_mac_f32_e32 v11, 0x4f800000, v12
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v9, v9
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v10, v11
+; GISEL-NEXT:    v_mul_f32_e32 v9, 0x5f7ffffc, v9
+; GISEL-NEXT:    v_mul_f32_e32 v10, 0x5f7ffffc, v10
+; GISEL-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v9
+; GISEL-NEXT:    v_mul_f32_e32 v12, 0x2f800000, v10
+; GISEL-NEXT:    v_trunc_f32_e32 v11, v11
+; GISEL-NEXT:    v_trunc_f32_e32 v12, v12
+; GISEL-NEXT:    v_mac_f32_e32 v9, 0xcf800000, v11
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; GISEL-NEXT:    v_mac_f32_e32 v10, 0xcf800000, v12
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v12
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v8
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v8
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v13
-; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v12
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT:    v_mul_lo_u32 v15, v9, v13
-; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v13
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v11
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v10
+; GISEL-NEXT:    v_mul_lo_u32 v17, v13, v12
+; GISEL-NEXT:    v_mul_lo_u32 v18, v13, v10
+; GISEL-NEXT:    v_mul_lo_u32 v19, v15, v10
+; GISEL-NEXT:    v_mul_hi_u32 v20, v13, v10
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
+; GISEL-NEXT:    v_mul_lo_u32 v19, v12, v18
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v20
+; GISEL-NEXT:    v_mul_lo_u32 v20, v10, v17
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
+; GISEL-NEXT:    v_mul_hi_u32 v20, v10, v18
+; GISEL-NEXT:    v_add_i32_e64 v19, s[4:5], v19, v20
+; GISEL-NEXT:    v_mul_lo_u32 v19, v8, v9
+; GISEL-NEXT:    v_mul_lo_u32 v20, v14, v9
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v20, v16
+; GISEL-NEXT:    v_mul_hi_u32 v20, v8, v9
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v16, v20
+; GISEL-NEXT:    v_mul_lo_u32 v20, v11, v19
+; GISEL-NEXT:    v_mul_lo_u32 v21, v9, v16
+; GISEL-NEXT:    v_add_i32_e64 v20, s[6:7], v20, v21
+; GISEL-NEXT:    v_mul_hi_u32 v21, v9, v19
+; GISEL-NEXT:    v_add_i32_e64 v20, s[8:9], v20, v21
+; GISEL-NEXT:    v_mul_hi_u32 v19, v11, v19
+; GISEL-NEXT:    v_mul_hi_u32 v18, v12, v18
+; GISEL-NEXT:    v_mul_lo_u32 v20, v11, v16
+; GISEL-NEXT:    v_add_i32_e64 v19, s[10:11], v20, v19
+; GISEL-NEXT:    v_mul_lo_u32 v20, v12, v17
+; GISEL-NEXT:    v_add_i32_e64 v18, s[12:13], v20, v18
+; GISEL-NEXT:    v_mul_hi_u32 v20, v9, v16
+; GISEL-NEXT:    v_add_i32_e64 v19, s[14:15], v19, v20
+; GISEL-NEXT:    v_mul_hi_u32 v20, v10, v17
+; GISEL-NEXT:    v_add_i32_e64 v18, s[16:17], v18, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v20, s[6:7], v20, v21
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[14:15]
+; GISEL-NEXT:    v_add_i32_e64 v21, s[6:7], v21, v22
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GISEL-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s[12:13]
+; GISEL-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s[16:17]
+; GISEL-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
+; GISEL-NEXT:    v_add_i32_e64 v18, s[4:5], v18, v22
+; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v19
+; GISEL-NEXT:    v_mul_hi_u32 v16, v11, v16
+; GISEL-NEXT:    v_mul_hi_u32 v17, v12, v17
+; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v10, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v21, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v23, v19
+; GISEL-NEXT:    v_mul_lo_u32 v20, v8, v9
+; GISEL-NEXT:    v_mul_lo_u32 v14, v14, v9
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
+; GISEL-NEXT:    v_mul_hi_u32 v18, v8, v9
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
+; GISEL-NEXT:    v_mul_lo_u32 v19, v13, v10
+; GISEL-NEXT:    v_mul_lo_u32 v15, v15, v10
+; GISEL-NEXT:    v_addc_u32_e64 v11, vcc, v11, v16, s[6:7]
+; GISEL-NEXT:    v_mul_hi_u32 v16, v13, v10
+; GISEL-NEXT:    v_addc_u32_e64 v12, vcc, v12, v17, s[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v17, v9, v20
+; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, v10, v19
+; GISEL-NEXT:    v_mul_lo_u32 v13, v13, v12
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v20
+; GISEL-NEXT:    v_mul_hi_u32 v20, v11, v20
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v18
+; GISEL-NEXT:    v_mul_lo_u32 v18, v12, v19
+; GISEL-NEXT:    v_mul_hi_u32 v19, v12, v19
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT:    v_mul_lo_u32 v16, v9, v8
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v8
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v13, v8, v10
-; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v12
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, v0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v1, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_lo_u32 v16, v11, v8
+; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v17
+; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v8
+; GISEL-NEXT:    v_mul_lo_u32 v17, v10, v13
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v16, v20
+; GISEL-NEXT:    v_mul_lo_u32 v20, v12, v13
+; GISEL-NEXT:    v_add_i32_e64 v17, s[8:9], v18, v17
+; GISEL-NEXT:    v_mul_hi_u32 v18, v10, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v12, v13
+; GISEL-NEXT:    v_add_i32_e64 v19, s[10:11], v20, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v17, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v16, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[10:11]
+; GISEL-NEXT:    v_add_i32_e64 v18, s[8:9], v19, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v19, s[4:5], v20, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v20
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v19
+; GISEL-NEXT:    v_add_i32_e64 v17, s[4:5], v18, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v17
+; GISEL-NEXT:    v_add_i32_e64 v14, s[6:7], v14, v18
+; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v16, v19
+; GISEL-NEXT:    v_mul_lo_u32 v16, v1, v9
+; GISEL-NEXT:    v_mul_hi_u32 v17, v0, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v8
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v13, v4, v8
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_subb_u32_e64 v10, s[4:5], v1, v11, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v11
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, v11, v12, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GISEL-NEXT:    v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v8
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v13, v0, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v11
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v12, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v11, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v12, v4, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v7
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v6
-; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, 0, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v4
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v10
-; GISEL-NEXT:    v_mul_lo_u32 v13, v4, v11
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_mul_lo_u32 v18, v3, v10
+; GISEL-NEXT:    v_mul_hi_u32 v19, v2, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v3, v10
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v8, v14
+; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v13, v15
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v11, v8, vcc
+; GISEL-NEXT:    v_addc_u32_e64 v11, vcc, v12, v13, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v1, v8
+; GISEL-NEXT:    v_mul_lo_u32 v15, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v20, v3, v11
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
+; GISEL-NEXT:    v_mul_hi_u32 v16, v2, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v3, v11
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v13, v9
+; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v18, v15
+; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v20, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v14
+; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v13, v19
+; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v10, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v11
-; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v10
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v11
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v18, v14
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v20, v16
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v11
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v11, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v4
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v5, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v4, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v4
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v5
-; GISEL-NEXT:    v_mul_hi_u32 v11, v6, v4
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v3, v9, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v9
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, v9, v10, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v6
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; GISEL-NEXT:    v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v4
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v16, v4, v9
+; GISEL-NEXT:    v_mul_lo_u32 v17, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v18, v4, v9
+; GISEL-NEXT:    v_mul_lo_u32 v19, v6, v10
+; GISEL-NEXT:    v_mul_lo_u32 v20, v7, v10
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v10
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, 1, v9
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v16
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], 1, v10
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[8:9], v2, v19
+; GISEL-NEXT:    v_add_i32_e64 v19, s[10:11], 1, v15
+; GISEL-NEXT:    v_add_i32_e64 v8, s[12:13], v8, v13
+; GISEL-NEXT:    v_add_i32_e64 v13, s[12:13], 1, v16
+; GISEL-NEXT:    v_add_i32_e64 v11, s[14:15], v11, v12
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[14:15], v0, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[16:17], v2, v6
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[18:19], v0, v4
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[20:21], v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v8
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[22:23], v0, v4
+; GISEL-NEXT:    v_addc_u32_e32 v0, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v4, v6, v11
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v9
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v10, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v6, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT:    v_addc_u32_e64 v2, s[6:7], 0, v11, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[14:15]
+; GISEL-NEXT:    v_add_i32_e64 v12, s[6:7], v17, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[16:17]
+; GISEL-NEXT:    v_add_i32_e64 v4, s[6:7], v20, v4
+; GISEL-NEXT:    v_addc_u32_e64 v20, s[6:7], 0, v0, s[10:11]
+; GISEL-NEXT:    v_add_i32_e64 v12, s[6:7], v12, v18
+; GISEL-NEXT:    v_addc_u32_e64 v18, s[6:7], 0, v2, s[12:13]
+; GISEL-NEXT:    v_add_i32_e64 v4, s[6:7], v4, v14
+; GISEL-NEXT:    v_subb_u32_e64 v14, s[6:7], v1, v12, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v1, s[6:7], v1, v12
+; GISEL-NEXT:    v_subb_u32_e64 v12, s[6:7], v3, v4, s[8:9]
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[22:23]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v14, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[10:11], v14, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v1, vcc, v1, v5, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v12, v7
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v3, v7, s[8:9]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
+; GISEL-NEXT:    v_subbrev_u32_e64 v1, vcc, 0, v1, s[18:19]
+; GISEL-NEXT:    v_subbrev_u32_e64 v3, vcc, 0, v3, s[20:21]
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, v12, v17, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v7
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], v1, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[8:9], v3, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[8:9]
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v1
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[8:9], 0, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v15, v19, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v16, v13, s[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v0, v20, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v2, v18, s[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v10, v3, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v8, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v11, v5, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i64:
@@ -1250,256 +1250,256 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    s_mov_b64 s[4:5], 0x1000
 ; GISEL-NEXT:    v_lshl_b64 v[7:8], s[4:5], v4
 ; GISEL-NEXT:    v_lshl_b64 v[4:5], s[4:5], v6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v7
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v8
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v9
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v6
-; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v9
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v7
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v10, v8
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, 0, v7
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v12, v5
+; GISEL-NEXT:    v_sub_i32_e64 v13, s[4:5], 0, v4
+; GISEL-NEXT:    v_subb_u32_e32 v14, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v15, vcc, 0, v5, s[4:5]
+; GISEL-NEXT:    v_mac_f32_e32 v9, 0x4f800000, v10
+; GISEL-NEXT:    v_mac_f32_e32 v11, 0x4f800000, v12
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v9, v9
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v10, v11
+; GISEL-NEXT:    v_mul_f32_e32 v9, 0x5f7ffffc, v9
+; GISEL-NEXT:    v_mul_f32_e32 v10, 0x5f7ffffc, v10
+; GISEL-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v9
+; GISEL-NEXT:    v_mul_f32_e32 v12, 0x2f800000, v10
+; GISEL-NEXT:    v_trunc_f32_e32 v11, v11
+; GISEL-NEXT:    v_trunc_f32_e32 v12, v12
+; GISEL-NEXT:    v_mac_f32_e32 v9, 0xcf800000, v11
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; GISEL-NEXT:    v_mac_f32_e32 v10, 0xcf800000, v12
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v12
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v7
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v6
-; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v6
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v6
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v15, v6, v13
-; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v12
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT:    v_mul_lo_u32 v15, v9, v13
-; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v13
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v11
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v10
+; GISEL-NEXT:    v_mul_lo_u32 v17, v13, v12
+; GISEL-NEXT:    v_mul_lo_u32 v18, v13, v10
+; GISEL-NEXT:    v_mul_lo_u32 v19, v15, v10
+; GISEL-NEXT:    v_mul_hi_u32 v20, v13, v10
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
+; GISEL-NEXT:    v_mul_lo_u32 v19, v12, v18
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v20
+; GISEL-NEXT:    v_mul_lo_u32 v20, v10, v17
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
+; GISEL-NEXT:    v_mul_hi_u32 v20, v10, v18
+; GISEL-NEXT:    v_add_i32_e64 v19, s[4:5], v19, v20
+; GISEL-NEXT:    v_mul_lo_u32 v19, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v20, v14, v9
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v20, v16
+; GISEL-NEXT:    v_mul_hi_u32 v20, v6, v9
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v16, v20
+; GISEL-NEXT:    v_mul_lo_u32 v20, v11, v19
+; GISEL-NEXT:    v_mul_lo_u32 v21, v9, v16
+; GISEL-NEXT:    v_add_i32_e64 v20, s[6:7], v20, v21
+; GISEL-NEXT:    v_mul_hi_u32 v21, v9, v19
+; GISEL-NEXT:    v_add_i32_e64 v20, s[8:9], v20, v21
+; GISEL-NEXT:    v_mul_hi_u32 v19, v11, v19
+; GISEL-NEXT:    v_mul_hi_u32 v18, v12, v18
+; GISEL-NEXT:    v_mul_lo_u32 v20, v11, v16
+; GISEL-NEXT:    v_add_i32_e64 v19, s[10:11], v20, v19
+; GISEL-NEXT:    v_mul_lo_u32 v20, v12, v17
+; GISEL-NEXT:    v_add_i32_e64 v18, s[12:13], v20, v18
+; GISEL-NEXT:    v_mul_hi_u32 v20, v9, v16
+; GISEL-NEXT:    v_add_i32_e64 v19, s[14:15], v19, v20
+; GISEL-NEXT:    v_mul_hi_u32 v20, v10, v17
+; GISEL-NEXT:    v_add_i32_e64 v18, s[16:17], v18, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v20, s[6:7], v20, v21
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[14:15]
+; GISEL-NEXT:    v_add_i32_e64 v21, s[6:7], v21, v22
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GISEL-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s[12:13]
+; GISEL-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s[16:17]
+; GISEL-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
+; GISEL-NEXT:    v_add_i32_e64 v18, s[4:5], v18, v22
+; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v19
+; GISEL-NEXT:    v_mul_hi_u32 v16, v11, v16
+; GISEL-NEXT:    v_mul_hi_u32 v17, v12, v17
+; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v10, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v21, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v23, v19
+; GISEL-NEXT:    v_mul_lo_u32 v20, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v14, v14, v9
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
+; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
+; GISEL-NEXT:    v_mul_lo_u32 v19, v13, v10
+; GISEL-NEXT:    v_mul_lo_u32 v15, v15, v10
+; GISEL-NEXT:    v_addc_u32_e64 v11, vcc, v11, v16, s[6:7]
+; GISEL-NEXT:    v_mul_hi_u32 v16, v13, v10
+; GISEL-NEXT:    v_addc_u32_e64 v12, vcc, v12, v17, s[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v17, v9, v20
+; GISEL-NEXT:    v_mul_lo_u32 v6, v6, v11
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v14, v6
+; GISEL-NEXT:    v_mul_hi_u32 v14, v10, v19
+; GISEL-NEXT:    v_mul_lo_u32 v13, v13, v12
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v20
+; GISEL-NEXT:    v_mul_hi_u32 v20, v11, v20
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v18
+; GISEL-NEXT:    v_mul_lo_u32 v18, v12, v19
+; GISEL-NEXT:    v_mul_hi_u32 v19, v12, v19
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT:    v_mul_lo_u32 v16, v9, v6
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v6
-; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v6
-; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v6
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v10
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v12
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v6
-; GISEL-NEXT:    v_mul_lo_u32 v11, v0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v6
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v9
-; GISEL-NEXT:    v_mul_hi_u32 v6, v1, v6
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v9
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_lo_u32 v16, v11, v6
+; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v17
+; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v6
+; GISEL-NEXT:    v_mul_lo_u32 v17, v10, v13
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v16, v20
+; GISEL-NEXT:    v_mul_lo_u32 v20, v12, v13
+; GISEL-NEXT:    v_add_i32_e64 v17, s[8:9], v18, v17
+; GISEL-NEXT:    v_mul_hi_u32 v18, v10, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v12, v13
+; GISEL-NEXT:    v_add_i32_e64 v19, s[10:11], v20, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v17, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v16, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[10:11]
+; GISEL-NEXT:    v_add_i32_e64 v18, s[8:9], v19, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v19, s[4:5], v20, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v20
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v19
+; GISEL-NEXT:    v_add_i32_e64 v17, s[4:5], v18, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v17
+; GISEL-NEXT:    v_add_i32_e64 v14, s[6:7], v14, v18
+; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v16, v19
+; GISEL-NEXT:    v_mul_lo_u32 v16, v1, v9
+; GISEL-NEXT:    v_mul_hi_u32 v17, v0, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v6
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v13, v7, v6
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_subb_u32_e64 v10, s[4:5], v1, v11, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v11
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, v11, v12, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v7
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT:    v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v6
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v13, v0, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v11
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v12, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v11, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v12, v7, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v4
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v5
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v4
-; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v6
-; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v11
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_mul_lo_u32 v18, v3, v10
+; GISEL-NEXT:    v_mul_hi_u32 v19, v2, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v3, v10
+; GISEL-NEXT:    v_add_i32_e64 v6, s[6:7], v6, v14
+; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v13, v15
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v11, v6, vcc
+; GISEL-NEXT:    v_addc_u32_e64 v11, vcc, v12, v13, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v6
+; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v6
+; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v1, v6
+; GISEL-NEXT:    v_mul_lo_u32 v15, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v20, v3, v11
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
+; GISEL-NEXT:    v_mul_hi_u32 v16, v2, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v3, v11
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v13, v9
+; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v18, v15
+; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v20, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v14
+; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v13, v19
+; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v10, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v11
-; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v11
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v18, v14
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v20, v16
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v11
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v6
-; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v6
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v6
-; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v7
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v6
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v7
-; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v7
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT:    v_mul_lo_u32 v8, v4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v9, v5, v6
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v6
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v3, v9, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v9
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, v9, v10, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v4
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
-; GISEL-NEXT:    v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v6
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v7, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v16, v7, v9
+; GISEL-NEXT:    v_mul_lo_u32 v17, v8, v9
+; GISEL-NEXT:    v_mul_hi_u32 v18, v7, v9
+; GISEL-NEXT:    v_mul_lo_u32 v19, v4, v10
+; GISEL-NEXT:    v_mul_lo_u32 v20, v5, v10
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v10
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, 1, v9
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v16
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], 1, v10
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[8:9], v2, v19
+; GISEL-NEXT:    v_add_i32_e64 v19, s[10:11], 1, v15
+; GISEL-NEXT:    v_add_i32_e64 v6, s[12:13], v6, v13
+; GISEL-NEXT:    v_add_i32_e64 v13, s[12:13], 1, v16
+; GISEL-NEXT:    v_add_i32_e64 v11, s[14:15], v11, v12
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[14:15], v0, v7
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[16:17], v2, v4
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[18:19], v0, v7
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[20:21], v2, v4
+; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[22:23], v0, v7
+; GISEL-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v9
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v10, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v4, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v2, v4, v11
+; GISEL-NEXT:    v_add_i32_e64 v4, s[24:25], v17, v12
+; GISEL-NEXT:    v_addc_u32_e64 v7, s[6:7], 0, v11, s[6:7]
+; GISEL-NEXT:    v_add_i32_e64 v2, s[6:7], v20, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[14:15]
+; GISEL-NEXT:    v_add_i32_e64 v4, s[6:7], v4, v18
+; GISEL-NEXT:    v_subb_u32_e64 v17, s[6:7], v1, v4, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v1, s[6:7], v1, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[16:17]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v17, v8
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[14:15], v17, v8
+; GISEL-NEXT:    v_addc_u32_e64 v17, s[10:11], 0, v0, s[10:11]
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v1, v8, s[4:5]
+; GISEL-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[18:19]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v8
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[10:11], v1, v8
+; GISEL-NEXT:    v_addc_u32_e64 v1, s[12:13], 0, v7, s[12:13]
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[22:23]
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, -1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v14
+; GISEL-NEXT:    v_subb_u32_e64 v14, vcc, v3, v2, s[8:9]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v3, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v14, v5
+; GISEL-NEXT:    v_subb_u32_e64 v2, s[8:9], v2, v5, s[8:9]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[8:9], v14, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
+; GISEL-NEXT:    v_subbrev_u32_e64 v2, vcc, 0, v2, s[20:21]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, v12, s[14:15]
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v14, v4, s[8:9]
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], v2, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v5, v18, s[6:7]
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v2
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[8:9], 0, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v15, v19, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v16, v13, s[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v0, v17, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v7, v1, s[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v9, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v10, v3, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v11, v5, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom:
@@ -1901,259 +1901,259 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_udiv_v2i64_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
-; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v6
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v1
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, 0, v1
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v3
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
-; GISEL-NEXT:    v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v6
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v7
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v8
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v6
-; GISEL-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v7
-; GISEL-NEXT:    v_trunc_f32_e32 v8, v8
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v6
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
+; GISEL-NEXT:    s_bfe_i32 s12, 1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s13, 1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s14, 1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s15, 1, 0x10000
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v1
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, 0, v1
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v0
+; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], 0, v0
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[6:7], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v10, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v4
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v5
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v7
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v4
+; GISEL-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v5
+; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
 ; GISEL-NEXT:    v_trunc_f32_e32 v11, v11
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v8
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v11
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v6
-; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v8
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v11
-; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v12
-; GISEL-NEXT:    v_mul_lo_u32 v15, v5, v12
-; GISEL-NEXT:    v_mul_hi_u32 v16, v4, v12
-; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v7
-; GISEL-NEXT:    v_mul_lo_u32 v18, v10, v7
-; GISEL-NEXT:    v_mul_hi_u32 v19, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v15, v6
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v11
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v4
+; GISEL-NEXT:    v_mul_lo_u32 v4, v6, v7
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v13, v8, v11
+; GISEL-NEXT:    v_mul_lo_u32 v14, v6, v12
+; GISEL-NEXT:    v_mul_lo_u32 v15, v9, v12
+; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v12
+; GISEL-NEXT:    v_mul_lo_u32 v17, v8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v18, v10, v5
+; GISEL-NEXT:    v_mul_hi_u32 v19, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v15, v4
+; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v14
+; GISEL-NEXT:    v_mul_hi_u32 v20, v12, v14
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v14
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
-; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v17
-; GISEL-NEXT:    v_mul_hi_u32 v18, v7, v17
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v19
-; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v13
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v19
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
-; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v14
-; GISEL-NEXT:    v_mul_hi_u32 v18, v12, v14
-; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v14
+; GISEL-NEXT:    v_mul_lo_u32 v18, v11, v17
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v4, v16
+; GISEL-NEXT:    v_mul_hi_u32 v4, v5, v17
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v11, v17
-; GISEL-NEXT:    v_add_i32_e64 v16, s[4:5], v6, v16
-; GISEL-NEXT:    v_mul_lo_u32 v6, v12, v16
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v15, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v18
-; GISEL-NEXT:    v_mul_lo_u32 v6, v8, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v19
+; GISEL-NEXT:    v_mul_lo_u32 v19, v5, v13
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v18, v19
+; GISEL-NEXT:    v_mul_lo_u32 v19, v12, v16
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v18, v4
+; GISEL-NEXT:    v_mul_lo_u32 v4, v7, v16
 ; GISEL-NEXT:    v_mul_hi_u32 v18, v12, v16
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v18, s[4:5], v6, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v19, v6
-; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v17
-; GISEL-NEXT:    v_mul_hi_u32 v17, v7, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v6, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v20, vcc, v20, v6
-; GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff, v0
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v2
-; GISEL-NEXT:    s_bfe_i32 s4, 1, 0x10000
-; GISEL-NEXT:    s_bfe_i32 s5, 1, 0x10000
-; GISEL-NEXT:    s_bfe_i32 s6, 1, 0x10000
-; GISEL-NEXT:    s_bfe_i32 s7, 1, 0x10000
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v18, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT:    v_mov_b32_e32 v15, s4
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v20, v18
-; GISEL-NEXT:    v_mov_b32_e32 v19, s5
-; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v16
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
-; GISEL-NEXT:    v_mov_b32_e32 v16, s6
-; GISEL-NEXT:    v_mul_hi_u32 v13, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v14, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v2
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v2
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v2
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
-; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, v11, v13, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v7
-; GISEL-NEXT:    v_mul_lo_u32 v10, v10, v7
-; GISEL-NEXT:    v_mul_hi_u32 v17, v9, v7
-; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v8
-; GISEL-NEXT:    v_mul_lo_u32 v18, v8, v12
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_mul_hi_u32 v5, v2, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
-; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_mul_lo_u32 v10, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
-; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v13
-; GISEL-NEXT:    v_mul_hi_u32 v13, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
-; GISEL-NEXT:    v_mul_lo_u32 v17, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
+; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v15, v19
+; GISEL-NEXT:    v_mul_lo_u32 v19, v11, v13
+; GISEL-NEXT:    v_add_i32_e64 v4, s[8:9], v4, v14
+; GISEL-NEXT:    v_mul_hi_u32 v14, v5, v13
+; GISEL-NEXT:    v_add_i32_e64 v17, s[10:11], v19, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v4, s[6:7], v4, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[10:11]
+; GISEL-NEXT:    v_add_i32_e64 v14, s[8:9], v17, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT:    v_mul_lo_u32 v10, v2, v4
-; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v4
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v18, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v2, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v18, v10
-; GISEL-NEXT:    v_mul_lo_u32 v18, v11, v9
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v12, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[6:7]
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v20, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v18, v20
+; GISEL-NEXT:    v_mov_b32_e32 v20, s12
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v4, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, s13
+; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
+; GISEL-NEXT:    v_mov_b32_e32 v19, s14
+; GISEL-NEXT:    v_add_i32_e64 v12, s[6:7], v12, v15
+; GISEL-NEXT:    v_mov_b32_e32 v15, s15
+; GISEL-NEXT:    v_mul_hi_u32 v16, v7, v16
+; GISEL-NEXT:    v_mul_hi_u32 v13, v11, v13
+; GISEL-NEXT:    v_add_i32_e64 v5, s[8:9], v5, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v17, v14
-; GISEL-NEXT:    v_mul_hi_u32 v17, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
-; GISEL-NEXT:    v_mov_b32_e32 v18, s7
-; GISEL-NEXT:    v_mul_hi_u32 v4, v8, v4
-; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v18, v6, v12
+; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v12
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v12
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
+; GISEL-NEXT:    v_mul_lo_u32 v17, v8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, v10, v5
+; GISEL-NEXT:    v_addc_u32_e64 v7, vcc, v7, v14, s[6:7]
+; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v5
+; GISEL-NEXT:    v_addc_u32_e64 v11, vcc, v11, v13, s[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v13, v12, v18
+; GISEL-NEXT:    v_mul_lo_u32 v6, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v17
+; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v18
+; GISEL-NEXT:    v_mul_hi_u32 v18, v7, v18
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v16
+; GISEL-NEXT:    v_mul_lo_u32 v16, v11, v17
+; GISEL-NEXT:    v_mul_hi_u32 v17, v11, v17
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
+; GISEL-NEXT:    v_mul_lo_u32 v14, v12, v6
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
+; GISEL-NEXT:    v_mul_lo_u32 v14, v7, v6
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
+; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v6
+; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v8
+; GISEL-NEXT:    v_add_i32_e64 v14, s[6:7], v14, v18
+; GISEL-NEXT:    v_mul_lo_u32 v18, v11, v8
+; GISEL-NEXT:    v_add_i32_e64 v13, s[8:9], v16, v13
+; GISEL-NEXT:    v_mul_hi_u32 v16, v5, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v8
+; GISEL-NEXT:    v_add_i32_e64 v17, s[10:11], v18, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v10, s[6:7], v14, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[10:11]
+; GISEL-NEXT:    v_add_i32_e64 v16, s[8:9], v17, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v17, s[4:5], v18, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[6:7]
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v18
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
+; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v14
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, 0, v2
-; GISEL-NEXT:    v_mul_hi_u32 v8, v6, v2
-; GISEL-NEXT:    v_mul_hi_u32 v2, 0, v2
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v11, v9, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, 0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v0, v7
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v13
+; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v16
+; GISEL-NEXT:    v_add_i32_e64 v12, s[6:7], v14, v17
+; GISEL-NEXT:    v_mul_lo_u32 v13, 0, v10
+; GISEL-NEXT:    v_mul_hi_u32 v14, v3, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, 0, v10
+; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v5
+; GISEL-NEXT:    v_mul_hi_u32 v17, v2, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, 0, v5
+; GISEL-NEXT:    v_add_i32_e64 v6, s[6:7], v6, v9
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v8, v12
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v7, v6, vcc
+; GISEL-NEXT:    v_addc_u32_e64 v7, vcc, v11, v8, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v6
+; GISEL-NEXT:    v_mul_lo_u32 v9, 0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v11, v3, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, 0, v6
+; GISEL-NEXT:    v_mul_lo_u32 v12, v2, v7
+; GISEL-NEXT:    v_mul_lo_u32 v18, 0, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT:    v_mul_hi_u32 v13, v2, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v7, 0, v7
-; GISEL-NEXT:    v_mul_lo_u32 v12, v6, v4
-; GISEL-NEXT:    v_mul_lo_u32 v13, 0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, 0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v17, v0, v9
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_mul_lo_u32 v10, 0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v11, v0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, 0, v9
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
+; GISEL-NEXT:    v_add_i32_e64 v10, s[6:7], v16, v12
+; GISEL-NEXT:    v_add_i32_e64 v5, s[8:9], v18, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[6:7]
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
+; GISEL-NEXT:    v_add_i32_e64 v10, s[6:7], v10, v17
+; GISEL-NEXT:    v_add_i32_e64 v5, s[8:9], v5, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v16, v11
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v18, v13
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v13, v2
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v12, v5
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v14
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, v1, v2
-; GISEL-NEXT:    v_mul_lo_u32 v12, 0, v2
-; GISEL-NEXT:    v_mul_hi_u32 v13, v1, v2
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v7
-; GISEL-NEXT:    v_mul_lo_u32 v14, 0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v17, v3, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v10
-; GISEL-NEXT:    v_mul_lo_u32 v9, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, 1, v2
-; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
-; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], 0, v9, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
-; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], 1, v7
-; GISEL-NEXT:    v_addc_u32_e64 v17, s[6:7], 0, v5, s[6:7]
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[6:7], v0, v11
-; GISEL-NEXT:    v_subb_u32_e64 v11, s[8:9], 0, v10, s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, v15, v13, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, v16, v15, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], 1, v12
-; GISEL-NEXT:    v_addc_u32_e64 v16, s[4:5], 0, v14, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v10, s[4:5], 0, v10
-; GISEL-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[6:7]
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v3
-; GISEL-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
-; GISEL-NEXT:    v_add_i32_e64 v0, s[6:7], 1, v8
-; GISEL-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, v17, s[6:7]
-; GISEL-NEXT:    v_sub_i32_e64 v9, s[6:7], 0, v9
-; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v6, v1
-; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v19, v1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v18, v6, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v12, v15, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, v8, v0, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v14, v16, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v17, v3, s[4:5]
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, v7, v6, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v9
+; GISEL-NEXT:    v_mul_lo_u32 v14, 0, v9
+; GISEL-NEXT:    v_mul_hi_u32 v16, v1, v9
+; GISEL-NEXT:    v_mul_lo_u32 v17, v0, v5
+; GISEL-NEXT:    v_mul_lo_u32 v18, 0, v5
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v11, v0, v5
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, 1, v9
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v13
+; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], 1, v5
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[8:9], v2, v17
+; GISEL-NEXT:    v_add_i32_e64 v17, s[10:11], 1, v12
+; GISEL-NEXT:    v_add_i32_e64 v6, s[12:13], v6, v10
+; GISEL-NEXT:    v_add_i32_e64 v10, s[12:13], 1, v13
+; GISEL-NEXT:    v_add_i32_e64 v7, s[14:15], v7, v8
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[14:15], v3, v1
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[16:17], v2, v0
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[18:19], v3, v1
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[20:21], v2, v0
+; GISEL-NEXT:    v_mul_lo_u32 v8, v1, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[22:23], v3, v1
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v3, v0, v7
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
+; GISEL-NEXT:    v_addc_u32_e64 v0, s[6:7], 0, v7, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[14:15]
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v14, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[16:17]
+; GISEL-NEXT:    v_add_i32_e64 v3, s[6:7], v18, v3
+; GISEL-NEXT:    v_addc_u32_e64 v18, s[6:7], 0, v1, s[10:11]
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v8, v16
+; GISEL-NEXT:    v_addc_u32_e64 v16, s[6:7], 0, v0, s[12:13]
+; GISEL-NEXT:    v_add_i32_e64 v3, s[6:7], v3, v11
+; GISEL-NEXT:    v_subb_u32_e64 v11, s[6:7], 0, v8, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v11
+; GISEL-NEXT:    v_subb_u32_e64 v11, s[10:11], 0, v3, s[8:9]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[10:11], 0, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[22:23]
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v20, v2, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, -1, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v8
+; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v3
+; GISEL-NEXT:    v_subbrev_u32_e64 v8, vcc, 0, v8, s[4:5]
+; GISEL-NEXT:    v_subbrev_u32_e64 v3, vcc, 0, v3, s[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, v19, v14, s[10:11]
+; GISEL-NEXT:    v_subbrev_u32_e64 v8, vcc, 0, v8, s[18:19]
+; GISEL-NEXT:    v_subbrev_u32_e64 v3, vcc, 0, v3, s[20:21]
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v2
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[8:9], 0, v14
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v11, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v15, v20, s[4:5]
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v12, v17, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v13, v10, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v18, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v0, v16, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v9, v2, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, v1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v4, s[8:9]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i64_24bit:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 9d8f051b83036..67c87beec262d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -1092,96 +1092,96 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, v1, v2, s[0:1]
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s14
 ; GFX8-NEXT:    v_subb_u32_e32 v4, vcc, v0, v3, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v1
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s14
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
 ; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, s12, v7
+; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_subbrev_u32_e64 v11, s[0:1], 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v5
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX8-NEXT:    v_trunc_f32_e32 v2, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v2
+; GFX8-NEXT:    v_trunc_f32_e32 v14, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v14
 ; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v12, v0
-; GFX8-NEXT:    v_add_u32_e64 v13, s[0:1], 1, v5
-; GFX8-NEXT:    v_addc_u32_e64 v14, s[0:1], 0, v6, s[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v12, 0
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v15, v2
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v15, v0
+; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v6, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v11
-; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v4, v3, vcc
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s2, v15, v[1:2]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v10
-; GFX8-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v12, v[1:2]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[0:1]
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v15, 0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v14, v14
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v11
-; GFX8-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[0:1]
-; GFX8-NEXT:    v_mul_lo_u32 v2, v15, v0
-; GFX8-NEXT:    v_mul_lo_u32 v17, v12, v1
-; GFX8-NEXT:    v_mul_hi_u32 v4, v12, v0
-; GFX8-NEXT:    v_mul_hi_u32 v0, v15, v0
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v17
-; GFX8-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v2, v16, s[0:1]
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s2, v14, v[1:2]
+; GFX8-NEXT:    v_add_u32_e64 v17, s[0:1], 1, v12
+; GFX8-NEXT:    v_addc_u32_e64 v18, s[0:1], 0, v13, s[0:1]
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v15, v[1:2]
+; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v4, v3, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v3, v14, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v4, v15, v1
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v17, v2
-; GFX8-NEXT:    v_mul_hi_u32 v17, v12, v1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v17
-; GFX8-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v17
-; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 1, v13
-; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v14, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v19, vcc, s12, v10
-; GFX8-NEXT:    v_mul_hi_u32 v1, v15, v1
-; GFX8-NEXT:    v_subbrev_u32_e32 v20, vcc, 0, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_subbrev_u32_e32 v20, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_mul_hi_u32 v2, v15, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v3, v14, v1
+; GFX8-NEXT:    v_mul_hi_u32 v0, v14, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
+; GFX8-NEXT:    v_mul_hi_u32 v4, v15, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT:    v_mul_hi_u32 v1, v14, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v0
-; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0
-; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, v15, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v0
+; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0
+; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, v14, v1, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v13, v17, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v12, v17, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v14, v18, vcc
+; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v14, v[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v13, v18, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4]
+; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s3, v15, v[3:4]
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v16
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v13, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v12, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v10, v19, s[0:1]
-; GFX8-NEXT:    v_mul_lo_u32 v6, v15, v2
-; GFX8-NEXT:    v_mul_lo_u32 v9, v12, v3
+; GFX8-NEXT:    v_mul_lo_u32 v6, v14, v2
+; GFX8-NEXT:    v_mul_lo_u32 v9, v15, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
-; GFX8-NEXT:    v_mul_hi_u32 v7, v12, v2
+; GFX8-NEXT:    v_mul_hi_u32 v7, v15, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v11, v20, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v6, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v6, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX8-NEXT:    v_mul_lo_u32 v7, v15, v3
-; GFX8-NEXT:    v_mul_hi_u32 v2, v15, v2
+; GFX8-NEXT:    v_mul_lo_u32 v7, v14, v3
+; GFX8-NEXT:    v_mul_hi_u32 v2, v14, v2
 ; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v9, v6
-; GFX8-NEXT:    v_mul_hi_u32 v9, v12, v3
+; GFX8-NEXT:    v_mul_hi_u32 v9, v15, v3
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v7, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v2, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v9
-; GFX8-NEXT:    v_mul_hi_u32 v3, v15, v3
+; GFX8-NEXT:    v_mul_hi_u32 v3, v14, v3
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v2, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v7, v6
 ; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v3, v6
-; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v12, v2
-; GFX8-NEXT:    v_addc_u32_e64 v3, s[0:1], v15, v3, s[0:1]
+; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v15, v2
+; GFX8-NEXT:    v_addc_u32_e64 v3, s[0:1], v14, v3, s[0:1]
 ; GFX8-NEXT:    v_mul_lo_u32 v6, s11, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v7, s10, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
@@ -1221,27 +1221,27 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s[0:1]
 ; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, s14, v7
 ; GFX8-NEXT:    v_subbrev_u32_e64 v12, s[0:1], 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e64 v13, s[0:1], 1, v8
+; GFX8-NEXT:    v_addc_u32_e64 v14, s[0:1], 0, v9, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v12
-; GFX8-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v11
-; GFX8-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v12
-; GFX8-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v14, s[0:1], 1, v8
 ; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v3, vcc
-; GFX8-NEXT:    v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v14
-; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v15, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX8-NEXT:    v_subrev_u32_e64 v13, s[0:1], s14, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v14, v3, vcc
-; GFX8-NEXT:    v_subbrev_u32_e64 v14, s[0:1], 0, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[0:1]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v12
+; GFX8-NEXT:    v_subrev_u32_e32 v18, vcc, s14, v11
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v15, v16, s[0:1]
+; GFX8-NEXT:    v_add_u32_e64 v16, s[0:1], 1, v13
+; GFX8-NEXT:    v_subbrev_u32_e32 v19, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1]
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v13, v16, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v16, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v3, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v11, v13, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v12, v14, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v9, v15, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v14, v17, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v11, v18, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v12, v19, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v7, v6, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v10, v8, s[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s5
@@ -1298,6 +1298,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v0
 ; GFX9-NEXT:    s_sub_u32 s2, 0, s14
@@ -1328,7 +1329,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT:    v_mul_hi_u32 v6, s9, v1
+; GFX9-NEXT:    v_mul_hi_u32 v5, s9, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
@@ -1340,178 +1341,178 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s12, v5, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v0, v2
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s12, v6, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_add3_u32 v6, v3, v2, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s12, v6, v[1:2]
+; GFX9-NEXT:    v_add3_u32 v8, v3, v0, v5
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s12, v8, v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s9
-; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, s8, v0
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s13, v5, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v3, s13
-; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[0:1], v4, v1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v8
-; GFX9-NEXT:    v_sub_u32_e32 v0, s9, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
+; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s8, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s13, v6, v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_subb_co_u32_e64 v10, s[0:1], v4, v2, vcc
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v10
+; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v1, v2, s[0:1]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s14
-; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v1
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    v_subrev_co_u32_e32 v10, vcc, s12, v7
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v11, s[0:1], 0, v4, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v2
-; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v0
-; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v5
-; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v6, s[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v12, 0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v15, v2
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s2, v15, v[1:2]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v10
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v2, v3, s[0:1]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s15
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v1, v7, vcc
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s14
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v12, vcc, s12, v9
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v13, s[0:1], 0, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], 1, v6
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v15, v2
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v15
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v16, v1
+; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v13
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v12, v[1:2]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v2, v15, v0
-; GFX9-NEXT:    v_mul_lo_u32 v17, v12, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, v12, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, v15, v0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v17
-; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s2, v16, 0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v15, v15
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v13
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v3, v17, s[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3]
+; GFX9-NEXT:    v_add_co_u32_e64 v18, s[0:1], 1, v5
+; GFX9-NEXT:    v_addc_co_u32_e64 v19, s[0:1], 0, v14, s[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s3, v16, v[2:3]
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v7, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v15, v1
-; GFX9-NEXT:    v_add_u32_e32 v2, v17, v2
-; GFX9-NEXT:    v_mul_hi_u32 v17, v12, v1
+; GFX9-NEXT:    v_mul_lo_u32 v7, v16, v2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v20, vcc, s12, v12
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v21, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v3, v16, v1
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, v15, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v15, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
+; GFX9-NEXT:    v_add_u32_e32 v3, v7, v3
+; GFX9-NEXT:    v_mul_hi_u32 v7, v16, v2
+; GFX9-NEXT:    v_mul_hi_u32 v2, v15, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v4, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v17
-; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v17
-; GFX9-NEXT:    v_add_co_u32_e32 v17, vcc, 1, v13
-; GFX9-NEXT:    v_addc_co_u32_e32 v18, vcc, 0, v14, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v19, vcc, s12, v10
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v20, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v0
-; GFX9-NEXT:    v_add3_u32 v1, v4, v2, v1
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0
-; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v1, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v13, v17, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v14, v18, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4]
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v13, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v16
-; GFX9-NEXT:    v_mul_lo_u32 v5, v15, v2
-; GFX9-NEXT:    v_mul_lo_u32 v6, v12, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, v19, s[0:1]
-; GFX9-NEXT:    v_mul_hi_u32 v10, v12, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v11, v20, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v16, v1
+; GFX9-NEXT:    v_add3_u32 v2, v4, v3, v2
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v7, 0
+; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v2, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v18, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[1:2]
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v14, v19, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s3, v7, v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v14, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v17
+; GFX9-NEXT:    v_mul_lo_u32 v6, v15, v3
+; GFX9-NEXT:    v_mul_lo_u32 v8, v7, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v12, v20, s[0:1]
+; GFX9-NEXT:    v_mul_hi_u32 v12, v7, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v13, v21, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], v6, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], v6, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v10, v15, v3
-; GFX9-NEXT:    v_mul_hi_u32 v2, v15, v2
-; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT:    v_mul_hi_u32 v6, v12, v3
+; GFX9-NEXT:    v_mul_lo_u32 v12, v15, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v15, v3
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v10, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v2, v6
+; GFX9-NEXT:    v_add_u32_e32 v6, v8, v6
+; GFX9-NEXT:    v_mul_hi_u32 v8, v7, v4
+; GFX9-NEXT:    v_mul_hi_u32 v4, v15, v4
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], v12, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], v3, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], v3, v6
+; GFX9-NEXT:    v_add_u32_e32 v8, v12, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v2, v5
-; GFX9-NEXT:    v_add_u32_e32 v6, v10, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add3_u32 v3, v6, v5, v3
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v12, v2
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[0:1], v15, v3, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v5, s11, v2
-; GFX9-NEXT:    v_mul_lo_u32 v6, s10, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v7, s10, v2
-; GFX9-NEXT:    v_mul_hi_u32 v2, s11, v2
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v7, s11, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT:    v_mul_hi_u32 v6, s10, v3
-; GFX9-NEXT:    v_mul_hi_u32 v12, s11, v3
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v7, v2
+; GFX9-NEXT:    v_add3_u32 v4, v8, v6, v4
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], v7, v3
+; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[0:1], v15, v4, s[0:1]
+; GFX9-NEXT:    v_mul_lo_u32 v6, s11, v3
+; GFX9-NEXT:    v_mul_lo_u32 v7, s10, v4
+; GFX9-NEXT:    v_mul_hi_u32 v8, s10, v3
+; GFX9-NEXT:    v_mul_hi_u32 v3, s11, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], v6, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v2, v6
+; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], v6, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], v2, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s14, v10, 0
+; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v4
 ; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX9-NEXT:    v_add3_u32 v8, v6, v11, v12
-; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s14, v8, v[3:4]
-; GFX9-NEXT:    v_mov_b32_e32 v9, s11
-; GFX9-NEXT:    v_mov_b32_e32 v3, s15
-; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s15, v10, v[6:7]
-; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, s10, v2
-; GFX9-NEXT:    v_subb_co_u32_e64 v9, s[0:1], v9, v6, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v9
-; GFX9-NEXT:    v_sub_u32_e32 v2, s11, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v9
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s14, v7
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v12, s[0:1], 0, v2, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v14, s[0:1], 1, v10
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, 1, v14
-; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, 0, v15, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v14, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v15, s[0:1], s14, v11
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v16, s[0:1], 0, v2, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX9-NEXT:    v_mov_b32_e32 v13, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v10, v3, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v14, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v11, v15, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v12, v16, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v7, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s[0:1]
+; GFX9-NEXT:    v_mul_hi_u32 v7, s10, v4
+; GFX9-NEXT:    v_mul_hi_u32 v13, s11, v4
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], v8, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], v3, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0
+; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v11, vcc
+; GFX9-NEXT:    v_add3_u32 v10, v7, v12, v13
+; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v11, s11
+; GFX9-NEXT:    v_mov_b32_e32 v4, s15
+; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8]
+; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, s10, v3
+; GFX9-NEXT:    v_subb_co_u32_e64 v11, s[0:1], v11, v7, vcc
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v11
+; GFX9-NEXT:    v_sub_u32_e32 v3, s11, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v11
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v12, s[0:1]
+; GFX9-NEXT:    v_subrev_co_u32_e32 v12, vcc, s14, v8
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v13, s[0:1], 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v14, s[0:1], 1, v9
+; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[0:1], 0, v10, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v13
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v12
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v13
+; GFX9-NEXT:    v_subrev_co_u32_e32 v19, vcc, s14, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v17, s[0:1], 1, v14
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v20, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v14, v17, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v15, v18, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v12, v19, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v13, v20, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v11, v9, s[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v13, v[0:3], s[4:5]
-; GFX9-NEXT:    global_store_dwordx4 v13, v[4:7], s[6:7]
+; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v0, v[5:8], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_v2i64:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 61a9f261c8eb1..02acedf2c7ac7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -359,254 +359,254 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_urem_v2i64:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v5
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
-; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v8
-; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0xcf800000, v9
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v10, v5
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v12, v7
+; GISEL-NEXT:    v_sub_i32_e64 v13, s[4:5], 0, v6
+; GISEL-NEXT:    v_subb_u32_e32 v14, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v15, vcc, 0, v7, s[4:5]
+; GISEL-NEXT:    v_mac_f32_e32 v9, 0x4f800000, v10
+; GISEL-NEXT:    v_mac_f32_e32 v11, 0x4f800000, v12
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v9, v9
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v10, v11
+; GISEL-NEXT:    v_mul_f32_e32 v9, 0x5f7ffffc, v9
+; GISEL-NEXT:    v_mul_f32_e32 v10, 0x5f7ffffc, v10
+; GISEL-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v9
+; GISEL-NEXT:    v_mul_f32_e32 v12, 0x2f800000, v10
+; GISEL-NEXT:    v_trunc_f32_e32 v11, v11
+; GISEL-NEXT:    v_trunc_f32_e32 v12, v12
+; GISEL-NEXT:    v_mac_f32_e32 v9, 0xcf800000, v11
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; GISEL-NEXT:    v_mac_f32_e32 v10, 0xcf800000, v12
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v12
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v8
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v8
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v13
-; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v12
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT:    v_mul_lo_u32 v15, v9, v13
-; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v13
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v11
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v10
+; GISEL-NEXT:    v_mul_lo_u32 v17, v13, v12
+; GISEL-NEXT:    v_mul_lo_u32 v18, v13, v10
+; GISEL-NEXT:    v_mul_lo_u32 v19, v15, v10
+; GISEL-NEXT:    v_mul_hi_u32 v20, v13, v10
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
+; GISEL-NEXT:    v_mul_lo_u32 v19, v12, v18
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v20
+; GISEL-NEXT:    v_mul_lo_u32 v20, v10, v17
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
+; GISEL-NEXT:    v_mul_hi_u32 v20, v10, v18
+; GISEL-NEXT:    v_add_i32_e64 v19, s[4:5], v19, v20
+; GISEL-NEXT:    v_mul_lo_u32 v19, v8, v9
+; GISEL-NEXT:    v_mul_lo_u32 v20, v14, v9
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v20, v16
+; GISEL-NEXT:    v_mul_hi_u32 v20, v8, v9
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v16, v20
+; GISEL-NEXT:    v_mul_lo_u32 v20, v11, v19
+; GISEL-NEXT:    v_mul_lo_u32 v21, v9, v16
+; GISEL-NEXT:    v_add_i32_e64 v20, s[6:7], v20, v21
+; GISEL-NEXT:    v_mul_hi_u32 v21, v9, v19
+; GISEL-NEXT:    v_add_i32_e64 v20, s[8:9], v20, v21
+; GISEL-NEXT:    v_mul_hi_u32 v19, v11, v19
+; GISEL-NEXT:    v_mul_hi_u32 v18, v12, v18
+; GISEL-NEXT:    v_mul_lo_u32 v20, v11, v16
+; GISEL-NEXT:    v_add_i32_e64 v19, s[10:11], v20, v19
+; GISEL-NEXT:    v_mul_lo_u32 v20, v12, v17
+; GISEL-NEXT:    v_add_i32_e64 v18, s[12:13], v20, v18
+; GISEL-NEXT:    v_mul_hi_u32 v20, v9, v16
+; GISEL-NEXT:    v_add_i32_e64 v19, s[14:15], v19, v20
+; GISEL-NEXT:    v_mul_hi_u32 v20, v10, v17
+; GISEL-NEXT:    v_add_i32_e64 v18, s[16:17], v18, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v20, s[6:7], v20, v21
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[14:15]
+; GISEL-NEXT:    v_add_i32_e64 v21, s[6:7], v21, v22
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GISEL-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s[12:13]
+; GISEL-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s[16:17]
+; GISEL-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
+; GISEL-NEXT:    v_add_i32_e64 v18, s[4:5], v18, v22
+; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v19
+; GISEL-NEXT:    v_mul_hi_u32 v16, v11, v16
+; GISEL-NEXT:    v_mul_hi_u32 v17, v12, v17
+; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v10, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v21, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v23, v19
+; GISEL-NEXT:    v_mul_lo_u32 v20, v8, v9
+; GISEL-NEXT:    v_mul_lo_u32 v14, v14, v9
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
+; GISEL-NEXT:    v_mul_hi_u32 v18, v8, v9
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
+; GISEL-NEXT:    v_mul_lo_u32 v19, v13, v10
+; GISEL-NEXT:    v_mul_lo_u32 v15, v15, v10
+; GISEL-NEXT:    v_addc_u32_e64 v11, vcc, v11, v16, s[6:7]
+; GISEL-NEXT:    v_mul_hi_u32 v16, v13, v10
+; GISEL-NEXT:    v_addc_u32_e64 v12, vcc, v12, v17, s[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v17, v9, v20
+; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, v10, v19
+; GISEL-NEXT:    v_mul_lo_u32 v13, v13, v12
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v20
+; GISEL-NEXT:    v_mul_hi_u32 v20, v11, v20
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v18
+; GISEL-NEXT:    v_mul_lo_u32 v18, v12, v19
+; GISEL-NEXT:    v_mul_hi_u32 v19, v12, v19
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT:    v_mul_lo_u32 v16, v9, v8
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v8
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v13, v8, v10
-; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v12
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, v0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v1, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_lo_u32 v16, v11, v8
+; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v17
+; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v8
+; GISEL-NEXT:    v_mul_lo_u32 v17, v10, v13
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v16, v20
+; GISEL-NEXT:    v_mul_lo_u32 v20, v12, v13
+; GISEL-NEXT:    v_add_i32_e64 v17, s[8:9], v18, v17
+; GISEL-NEXT:    v_mul_hi_u32 v18, v10, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v12, v13
+; GISEL-NEXT:    v_add_i32_e64 v19, s[10:11], v20, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v17, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v16, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[10:11]
+; GISEL-NEXT:    v_add_i32_e64 v18, s[8:9], v19, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v19, s[4:5], v20, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v20
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v19
+; GISEL-NEXT:    v_add_i32_e64 v17, s[4:5], v18, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v17
+; GISEL-NEXT:    v_add_i32_e64 v14, s[6:7], v14, v18
+; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v16, v19
+; GISEL-NEXT:    v_mul_lo_u32 v16, v1, v9
+; GISEL-NEXT:    v_mul_hi_u32 v17, v0, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v4, v8
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v1, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v8
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v10, s[4:5], v0, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GISEL-NEXT:    v_subbrev_u32_e64 v11, vcc, 0, v1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v1, v5, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v7
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v6
-; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, 0, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v4
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v10
-; GISEL-NEXT:    v_mul_lo_u32 v13, v4, v11
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_mul_lo_u32 v18, v3, v10
+; GISEL-NEXT:    v_mul_hi_u32 v19, v2, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v3, v10
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v8, v14
+; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v13, v15
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v11, v8, vcc
+; GISEL-NEXT:    v_addc_u32_e64 v11, vcc, v12, v13, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v1, v8
+; GISEL-NEXT:    v_mul_lo_u32 v15, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v20, v3, v11
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
+; GISEL-NEXT:    v_mul_hi_u32 v16, v2, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v3, v11
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v13, v9
+; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v18, v15
+; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v20, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v14
+; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v13, v19
+; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v10, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v11
-; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v10
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v11
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v18, v14
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v20, v16
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v11
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v11, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v4
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v5, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v4, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v4
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v6, v5
-; GISEL-NEXT:    v_mul_hi_u32 v4, v6, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], v2, v6
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; GISEL-NEXT:    v_subbrev_u32_e64 v9, vcc, 0, v3, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v16, v4, v9
+; GISEL-NEXT:    v_mul_lo_u32 v17, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v9
+; GISEL-NEXT:    v_mul_lo_u32 v18, v6, v10
+; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v6, v10
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v16
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v18
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v8, v13
+; GISEL-NEXT:    v_add_i32_e64 v11, s[6:7], v11, v12
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v0, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v2, v6
+; GISEL-NEXT:    v_sub_i32_e64 v12, s[10:11], v0, v4
+; GISEL-NEXT:    v_sub_i32_e64 v13, s[12:13], v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, v4, v8
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[8:9]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v13, v6
+; GISEL-NEXT:    v_sub_i32_e64 v4, s[14:15], v12, v4
+; GISEL-NEXT:    v_sub_i32_e64 v6, s[16:17], v13, v6
+; GISEL-NEXT:    v_add_i32_e64 v8, s[18:19], v17, v8
+; GISEL-NEXT:    v_add_i32_e64 v11, s[18:19], v19, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v8, v9
+; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v11, v10
+; GISEL-NEXT:    v_subb_u32_e64 v10, s[6:7], v1, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v1, s[6:7], v1, v8
+; GISEL-NEXT:    v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v9
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v10, v5
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v7
 ; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[8:9], v8, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; GISEL-NEXT:    v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11]
+; GISEL-NEXT:    v_subb_u32_e64 v1, vcc, v1, v5, s[10:11]
+; GISEL-NEXT:    v_subbrev_u32_e64 v19, vcc, 0, v3, s[12:13]
+; GISEL-NEXT:    v_subb_u32_e64 v3, vcc, v3, v7, s[12:13]
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, v9, v14, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v11, v15, s[8:9]
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v18, v5
+; GISEL-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v19, v7
+; GISEL-NEXT:    v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], v18, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[8:9], v19, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v16, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v17, s[8:9]
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v5
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[8:9], 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v18, v1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i64:
@@ -1096,26 +1096,26 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_urem_v2i64_oddk_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s6, 0x12d8fb
+; GISEL-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; GISEL-NEXT:    s_sub_u32 s7, 0, 0x12d8fb
+; GISEL-NEXT:    s_sub_u32 s5, 0, 0x12d8fb
 ; GISEL-NEXT:    v_madmk_f32 v7, v5, 0x4f800000, v6
-; GISEL-NEXT:    s_subb_u32 s8, 0, 0
-; GISEL-NEXT:    s_bfe_i32 s4, 1, 0x10000
+; GISEL-NEXT:    s_subb_u32 s6, 0, 0
+; GISEL-NEXT:    s_bfe_i32 s7, 1, 0x10000
 ; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; GISEL-NEXT:    v_mov_b32_e32 v5, s4
+; GISEL-NEXT:    v_mov_b32_e32 v5, s7
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT:    s_sub_u32 s9, 0, 0x12d8fb
+; GISEL-NEXT:    s_sub_u32 s7, 0, 0x12d8fb
 ; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT:    s_subb_u32 s10, 0, 0
-; GISEL-NEXT:    s_bfe_i32 s4, 1, 0x10000
+; GISEL-NEXT:    s_subb_u32 s8, 0, 0
+; GISEL-NEXT:    s_bfe_i32 s9, 1, 0x10000
 ; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
 ; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v6
-; GISEL-NEXT:    v_mov_b32_e32 v10, s4
+; GISEL-NEXT:    v_mov_b32_e32 v10, s9
 ; GISEL-NEXT:    v_trunc_f32_e32 v8, v8
 ; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
 ; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
@@ -1123,119 +1123,119 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v9
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_mul_lo_u32 v11, s7, v8
+; GISEL-NEXT:    v_mul_lo_u32 v11, s5, v8
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, s9, v9
-; GISEL-NEXT:    v_mul_lo_u32 v13, s7, v7
-; GISEL-NEXT:    v_mul_lo_u32 v14, s8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v15, s7, v7
-; GISEL-NEXT:    v_mul_lo_u32 v16, s9, v6
-; GISEL-NEXT:    v_mul_lo_u32 v17, s10, v6
-; GISEL-NEXT:    v_mul_hi_u32 v18, s9, v6
+; GISEL-NEXT:    v_mul_lo_u32 v12, s7, v9
+; GISEL-NEXT:    v_mul_lo_u32 v13, s5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v14, s6, v7
+; GISEL-NEXT:    v_mul_hi_u32 v15, s5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v16, s7, v6
+; GISEL-NEXT:    v_mul_lo_u32 v17, s8, v6
+; GISEL-NEXT:    v_mul_hi_u32 v18, s7, v6
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v19, v7, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v13
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v16
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v16
+; GISEL-NEXT:    v_mul_hi_u32 v20, v6, v16
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v9, v16
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
-; GISEL-NEXT:    v_mul_lo_u32 v18, v6, v12
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v11
-; GISEL-NEXT:    v_mul_lo_u32 v17, v8, v11
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
-; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v11
+; GISEL-NEXT:    v_mul_lo_u32 v18, v8, v11
+; GISEL-NEXT:    v_mul_hi_u32 v21, v7, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v19
-; GISEL-NEXT:    v_mul_lo_u32 v19, v9, v12
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v17, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v17, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
-; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v12
+; GISEL-NEXT:    v_mul_lo_u32 v22, v6, v12
+; GISEL-NEXT:    v_mul_lo_u32 v23, v9, v12
+; GISEL-NEXT:    v_mul_hi_u32 v24, v6, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v19, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v22
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v23, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v21
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v24
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v18, v19
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v22, v17
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v23, v20
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v18, v17
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
 ; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v11, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, s7, v7
-; GISEL-NEXT:    v_mul_lo_u32 v13, s8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v14, s7, v7
+; GISEL-NEXT:    v_mul_lo_u32 v11, s5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v13, s6, v7
+; GISEL-NEXT:    v_mul_hi_u32 v14, s5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v16
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, s9, v6
-; GISEL-NEXT:    v_mul_lo_u32 v15, s10, v6
-; GISEL-NEXT:    v_mul_hi_u32 v16, s9, v6
-; GISEL-NEXT:    v_mul_lo_u32 v17, s7, v8
+; GISEL-NEXT:    v_mul_lo_u32 v12, s7, v6
+; GISEL-NEXT:    v_mul_lo_u32 v15, s8, v6
+; GISEL-NEXT:    v_mul_hi_u32 v16, s7, v6
+; GISEL-NEXT:    v_mul_lo_u32 v17, s5, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v18, v8, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v19, v7, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_mul_lo_u32 v20, s7, v9
+; GISEL-NEXT:    v_mul_lo_u32 v21, v9, v12
+; GISEL-NEXT:    v_mul_hi_u32 v22, v6, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
-; GISEL-NEXT:    v_mul_lo_u32 v17, s9, v9
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
-; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v12
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v20
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v15
-; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
-; GISEL-NEXT:    v_mul_lo_u32 v14, v7, v13
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v16
+; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v13
 ; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v13
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v18, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
-; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT:    v_mul_hi_u32 v17, v7, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v18, s[4:5], v18, v19
-; GISEL-NEXT:    v_mul_lo_u32 v19, v9, v15
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v16, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
+; GISEL-NEXT:    v_mul_lo_u32 v20, v6, v14
+; GISEL-NEXT:    v_mul_lo_u32 v23, v9, v14
+; GISEL-NEXT:    v_mul_hi_u32 v24, v6, v14
+; GISEL-NEXT:    v_mul_hi_u32 v14, v9, v14
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v18, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v16, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; GISEL-NEXT:    v_mul_hi_u32 v17, v6, v15
-; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v15
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v19, v12
+; GISEL-NEXT:    v_add_i32_e32 v20, vcc, v21, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v23, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v20, v22
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v24
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v18, v15
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v21, v19
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v23, v20
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v18
-; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v16
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v18, v17
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
 ; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v7
@@ -1252,50 +1252,50 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v18, v2, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v19, v3, v9
+; GISEL-NEXT:    v_mul_hi_u32 v20, v2, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v9
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v16, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v19, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v16, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v19, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v19
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v18, v11
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v14
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v20
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v15, v11
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v18, v12
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v19, v14
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_mul_lo_u32 v14, v7, s6
-; GISEL-NEXT:    v_mul_hi_u32 v7, s6, v7
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_mul_lo_u32 v12, v6, s6
-; GISEL-NEXT:    v_mul_hi_u32 v6, s6, v6
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_mul_lo_u32 v8, v8, s6
-; GISEL-NEXT:    v_mul_lo_u32 v9, v9, s6
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT:    v_mul_lo_u32 v13, v7, s4
+; GISEL-NEXT:    v_mul_hi_u32 v7, s4, v7
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
+; GISEL-NEXT:    v_mul_lo_u32 v14, v6, s4
+; GISEL-NEXT:    v_mul_hi_u32 v6, s4, v6
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT:    v_mul_lo_u32 v8, v8, s4
+; GISEL-NEXT:    v_mul_lo_u32 v9, v9, s4
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v14
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v13
 ; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v1, v7, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v7
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v12
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v14
 ; GISEL-NEXT:    v_subb_u32_e64 v9, s[6:7], v3, v6, s[4:5]
 ; GISEL-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v6
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v2, v4
@@ -1339,16 +1339,16 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-LABEL: v_urem_v2i64_oddk_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s6, 0x12d8fb
+; CGP-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT:    s_mov_b32 s7, 0xffed2705
-; CGP-NEXT:    s_bfe_i32 s4, 1, 0x10000
+; CGP-NEXT:    s_mov_b32 s5, 0xffed2705
+; CGP-NEXT:    s_bfe_i32 s6, 1, 0x10000
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v7, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v8, 0
 ; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v5
-; CGP-NEXT:    v_mov_b32_e32 v5, s4
+; CGP-NEXT:    v_mov_b32_e32 v5, s6
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v8
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
@@ -1363,14 +1363,14 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v9
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v9, v9
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, v8, s7
+; CGP-NEXT:    v_mul_lo_u32 v10, v8, s5
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_mul_lo_u32 v11, v9, s7
-; CGP-NEXT:    v_mul_lo_u32 v12, v6, s7
-; CGP-NEXT:    v_mul_hi_u32 v13, s7, v6
+; CGP-NEXT:    v_mul_lo_u32 v11, v9, s5
+; CGP-NEXT:    v_mul_lo_u32 v12, v6, s5
+; CGP-NEXT:    v_mul_hi_u32 v13, s5, v6
 ; CGP-NEXT:    v_sub_i32_e32 v10, vcc, v10, v6
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, s7
-; CGP-NEXT:    v_mul_hi_u32 v15, s7, v7
+; CGP-NEXT:    v_mul_lo_u32 v14, v7, s5
+; CGP-NEXT:    v_mul_hi_u32 v15, s5, v7
 ; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v11, v7
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; CGP-NEXT:    v_mul_lo_u32 v13, v8, v12
@@ -1381,54 +1381,54 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_hi_u32 v17, v7, v14
 ; CGP-NEXT:    v_mul_hi_u32 v14, v9, v14
 ; CGP-NEXT:    v_mul_lo_u32 v18, v6, v10
-; CGP-NEXT:    v_mul_lo_u32 v19, v7, v11
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
-; CGP-NEXT:    v_mul_lo_u32 v15, v8, v10
-; CGP-NEXT:    v_mul_hi_u32 v17, v6, v10
+; CGP-NEXT:    v_mul_lo_u32 v19, v8, v10
+; CGP-NEXT:    v_mul_hi_u32 v20, v6, v10
 ; CGP-NEXT:    v_mul_hi_u32 v10, v8, v10
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v18, v16
-; CGP-NEXT:    v_mul_hi_u32 v18, v7, v11
+; CGP-NEXT:    v_mul_lo_u32 v21, v7, v11
+; CGP-NEXT:    v_mul_lo_u32 v22, v9, v11
+; CGP-NEXT:    v_mul_hi_u32 v23, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v11, v9, v11
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
 ; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v18
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v19, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v21
+; CGP-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v22, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v20
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v23
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v19, v16
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v21, v15
+; CGP-NEXT:    v_add_i32_e32 v17, vcc, v22, v17
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
 ; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, s7
-; CGP-NEXT:    v_mul_hi_u32 v12, s7, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
+; CGP-NEXT:    v_mul_lo_u32 v10, v6, s5
+; CGP-NEXT:    v_mul_hi_u32 v12, s5, v6
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v14
 ; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, v7, s7
-; CGP-NEXT:    v_mul_hi_u32 v13, s7, v7
-; CGP-NEXT:    v_mul_lo_u32 v14, v8, s7
+; CGP-NEXT:    v_mul_lo_u32 v11, v7, s5
+; CGP-NEXT:    v_mul_hi_u32 v13, s5, v7
+; CGP-NEXT:    v_mul_lo_u32 v14, v8, s5
 ; CGP-NEXT:    v_mul_lo_u32 v15, v8, v10
 ; CGP-NEXT:    v_mul_hi_u32 v16, v6, v10
 ; CGP-NEXT:    v_mul_hi_u32 v10, v8, v10
-; CGP-NEXT:    v_mul_lo_u32 v17, v9, s7
+; CGP-NEXT:    v_mul_lo_u32 v17, v9, s5
 ; CGP-NEXT:    v_mul_lo_u32 v18, v9, v11
 ; CGP-NEXT:    v_mul_hi_u32 v19, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v11, v9, v11
@@ -1437,41 +1437,41 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v17, v13
 ; CGP-NEXT:    v_mul_lo_u32 v14, v6, v12
-; CGP-NEXT:    v_mul_lo_u32 v17, v7, v13
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
 ; CGP-NEXT:    v_mul_lo_u32 v17, v8, v12
-; CGP-NEXT:    v_mul_hi_u32 v19, v6, v12
+; CGP-NEXT:    v_mul_hi_u32 v20, v6, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v8, v12
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v16
-; CGP-NEXT:    v_mul_lo_u32 v14, v9, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v16
-; CGP-NEXT:    v_mul_hi_u32 v16, v7, v13
+; CGP-NEXT:    v_mul_lo_u32 v21, v7, v13
+; CGP-NEXT:    v_mul_lo_u32 v22, v9, v13
+; CGP-NEXT:    v_mul_hi_u32 v23, v7, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v9, v13
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v17, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v17, s[4:5], v17, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v17, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v18, vcc, v18, v21
+; CGP-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v22, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v20
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v18, vcc, v18, v19
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v18
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v23
+; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v16
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v21, v18
+; CGP-NEXT:    v_add_i32_e32 v17, vcc, v22, v19
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v16
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v12, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v10, v1, v6
@@ -1515,15 +1515,15 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT:    v_mul_lo_u32 v12, v6, s6
-; CGP-NEXT:    v_mul_hi_u32 v6, s6, v6
+; CGP-NEXT:    v_mul_lo_u32 v12, v6, s4
+; CGP-NEXT:    v_mul_hi_u32 v6, s4, v6
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT:    v_mul_lo_u32 v13, v7, s6
-; CGP-NEXT:    v_mul_hi_u32 v7, s6, v7
+; CGP-NEXT:    v_mul_lo_u32 v13, v7, s4
+; CGP-NEXT:    v_mul_hi_u32 v7, s4, v7
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; CGP-NEXT:    v_mul_lo_u32 v8, v8, s6
-; CGP-NEXT:    v_mul_lo_u32 v9, v9, s6
+; CGP-NEXT:    v_mul_lo_u32 v8, v8, s4
+; CGP-NEXT:    v_mul_lo_u32 v9, v9, s4
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v12
@@ -1756,256 +1756,256 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b64 s[4:5], 0x1000
-; GISEL-NEXT:    v_lshl_b64 v[7:8], s[4:5], v4
-; GISEL-NEXT:    v_lshl_b64 v[4:5], s[4:5], v6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v7
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v8
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v9
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v6
-; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v9
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_lshl_b64 v[4:5], s[4:5], v4
+; GISEL-NEXT:    v_lshl_b64 v[6:7], s[4:5], v6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v10, v5
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v12, v7
+; GISEL-NEXT:    v_sub_i32_e64 v13, s[4:5], 0, v6
+; GISEL-NEXT:    v_subb_u32_e32 v14, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v15, vcc, 0, v7, s[4:5]
+; GISEL-NEXT:    v_mac_f32_e32 v9, 0x4f800000, v10
+; GISEL-NEXT:    v_mac_f32_e32 v11, 0x4f800000, v12
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v9, v9
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v10, v11
+; GISEL-NEXT:    v_mul_f32_e32 v9, 0x5f7ffffc, v9
+; GISEL-NEXT:    v_mul_f32_e32 v10, 0x5f7ffffc, v10
+; GISEL-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v9
+; GISEL-NEXT:    v_mul_f32_e32 v12, 0x2f800000, v10
+; GISEL-NEXT:    v_trunc_f32_e32 v11, v11
+; GISEL-NEXT:    v_trunc_f32_e32 v12, v12
+; GISEL-NEXT:    v_mac_f32_e32 v9, 0xcf800000, v11
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; GISEL-NEXT:    v_mac_f32_e32 v10, 0xcf800000, v12
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v12
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v7
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v6
-; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v6
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v6
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v15, v6, v13
-; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v12
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT:    v_mul_lo_u32 v15, v9, v13
-; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v13
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v11
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v10
+; GISEL-NEXT:    v_mul_lo_u32 v17, v13, v12
+; GISEL-NEXT:    v_mul_lo_u32 v18, v13, v10
+; GISEL-NEXT:    v_mul_lo_u32 v19, v15, v10
+; GISEL-NEXT:    v_mul_hi_u32 v20, v13, v10
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
+; GISEL-NEXT:    v_mul_lo_u32 v19, v12, v18
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v20
+; GISEL-NEXT:    v_mul_lo_u32 v20, v10, v17
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
+; GISEL-NEXT:    v_mul_hi_u32 v20, v10, v18
+; GISEL-NEXT:    v_add_i32_e64 v19, s[4:5], v19, v20
+; GISEL-NEXT:    v_mul_lo_u32 v19, v8, v9
+; GISEL-NEXT:    v_mul_lo_u32 v20, v14, v9
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v20, v16
+; GISEL-NEXT:    v_mul_hi_u32 v20, v8, v9
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v16, v20
+; GISEL-NEXT:    v_mul_lo_u32 v20, v11, v19
+; GISEL-NEXT:    v_mul_lo_u32 v21, v9, v16
+; GISEL-NEXT:    v_add_i32_e64 v20, s[6:7], v20, v21
+; GISEL-NEXT:    v_mul_hi_u32 v21, v9, v19
+; GISEL-NEXT:    v_add_i32_e64 v20, s[8:9], v20, v21
+; GISEL-NEXT:    v_mul_hi_u32 v19, v11, v19
+; GISEL-NEXT:    v_mul_hi_u32 v18, v12, v18
+; GISEL-NEXT:    v_mul_lo_u32 v20, v11, v16
+; GISEL-NEXT:    v_add_i32_e64 v19, s[10:11], v20, v19
+; GISEL-NEXT:    v_mul_lo_u32 v20, v12, v17
+; GISEL-NEXT:    v_add_i32_e64 v18, s[12:13], v20, v18
+; GISEL-NEXT:    v_mul_hi_u32 v20, v9, v16
+; GISEL-NEXT:    v_add_i32_e64 v19, s[14:15], v19, v20
+; GISEL-NEXT:    v_mul_hi_u32 v20, v10, v17
+; GISEL-NEXT:    v_add_i32_e64 v18, s[16:17], v18, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v20, s[6:7], v20, v21
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[14:15]
+; GISEL-NEXT:    v_add_i32_e64 v21, s[6:7], v21, v22
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GISEL-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s[12:13]
+; GISEL-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s[16:17]
+; GISEL-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
+; GISEL-NEXT:    v_add_i32_e64 v18, s[4:5], v18, v22
+; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v19
+; GISEL-NEXT:    v_mul_hi_u32 v16, v11, v16
+; GISEL-NEXT:    v_mul_hi_u32 v17, v12, v17
+; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v10, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v21, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v23, v19
+; GISEL-NEXT:    v_mul_lo_u32 v20, v8, v9
+; GISEL-NEXT:    v_mul_lo_u32 v14, v14, v9
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
+; GISEL-NEXT:    v_mul_hi_u32 v18, v8, v9
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
+; GISEL-NEXT:    v_mul_lo_u32 v19, v13, v10
+; GISEL-NEXT:    v_mul_lo_u32 v15, v15, v10
+; GISEL-NEXT:    v_addc_u32_e64 v11, vcc, v11, v16, s[6:7]
+; GISEL-NEXT:    v_mul_hi_u32 v16, v13, v10
+; GISEL-NEXT:    v_addc_u32_e64 v12, vcc, v12, v17, s[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v17, v9, v20
+; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, v10, v19
+; GISEL-NEXT:    v_mul_lo_u32 v13, v13, v12
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v20
+; GISEL-NEXT:    v_mul_hi_u32 v20, v11, v20
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v18
+; GISEL-NEXT:    v_mul_lo_u32 v18, v12, v19
+; GISEL-NEXT:    v_mul_hi_u32 v19, v12, v19
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT:    v_mul_lo_u32 v16, v9, v8
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v6
-; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v6
-; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v6
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v10
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v12
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v6
-; GISEL-NEXT:    v_mul_lo_u32 v11, v0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v6
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v9
-; GISEL-NEXT:    v_mul_hi_u32 v6, v1, v6
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v9
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_lo_u32 v16, v11, v8
+; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v17
+; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v8
+; GISEL-NEXT:    v_mul_lo_u32 v17, v10, v13
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v16, v20
+; GISEL-NEXT:    v_mul_lo_u32 v20, v12, v13
+; GISEL-NEXT:    v_add_i32_e64 v17, s[8:9], v18, v17
+; GISEL-NEXT:    v_mul_hi_u32 v18, v10, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v12, v13
+; GISEL-NEXT:    v_add_i32_e64 v19, s[10:11], v20, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v17, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v16, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[10:11]
+; GISEL-NEXT:    v_add_i32_e64 v18, s[8:9], v19, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v19, s[4:5], v20, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v20
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v19
+; GISEL-NEXT:    v_add_i32_e64 v17, s[4:5], v18, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v17
+; GISEL-NEXT:    v_add_i32_e64 v14, s[6:7], v14, v18
+; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v16, v19
+; GISEL-NEXT:    v_mul_lo_u32 v16, v1, v9
+; GISEL-NEXT:    v_mul_hi_u32 v17, v0, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v6
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v6
-; GISEL-NEXT:    v_mul_lo_u32 v9, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v6
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v1, v6, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v6
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v10, s[4:5], v0, v7
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT:    v_subbrev_u32_e64 v11, vcc, 0, v1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v10, v7
-; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v1, v8, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v4
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v5
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v4
-; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v6
-; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v11
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_mul_lo_u32 v18, v3, v10
+; GISEL-NEXT:    v_mul_hi_u32 v19, v2, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v3, v10
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v8, v14
+; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v13, v15
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v11, v8, vcc
+; GISEL-NEXT:    v_addc_u32_e64 v11, vcc, v12, v13, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v1, v8
+; GISEL-NEXT:    v_mul_lo_u32 v15, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v20, v3, v11
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
+; GISEL-NEXT:    v_mul_hi_u32 v16, v2, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v3, v11
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v13, v9
+; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v18, v15
+; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v20, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v14
+; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v13, v19
+; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v10, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v11
-; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v11
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v18, v14
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v20, v16
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v11
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v6
-; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v6
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v6
-; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v7
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v6
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v7
-; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v7
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT:    v_mul_lo_u32 v8, v4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v9, v5, v6
-; GISEL-NEXT:    v_mul_lo_u32 v7, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v6, v4, v6
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_subb_u32_e64 v7, s[4:5], v3, v6, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v6
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], v2, v4
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
-; GISEL-NEXT:    v_subbrev_u32_e64 v9, vcc, 0, v3, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v16, v4, v9
+; GISEL-NEXT:    v_mul_lo_u32 v17, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v9
+; GISEL-NEXT:    v_mul_lo_u32 v18, v6, v10
+; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v6, v10
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v16
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v18
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v8, v13
+; GISEL-NEXT:    v_add_i32_e64 v11, s[6:7], v11, v12
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v0, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v2, v6
+; GISEL-NEXT:    v_sub_i32_e64 v12, s[10:11], v0, v4
+; GISEL-NEXT:    v_sub_i32_e64 v13, s[12:13], v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, v4, v8
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[8:9]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v13, v6
+; GISEL-NEXT:    v_sub_i32_e64 v4, s[14:15], v12, v4
+; GISEL-NEXT:    v_sub_i32_e64 v6, s[16:17], v13, v6
+; GISEL-NEXT:    v_add_i32_e64 v8, s[18:19], v17, v8
+; GISEL-NEXT:    v_add_i32_e64 v11, s[18:19], v19, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v8, v9
+; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v11, v10
+; GISEL-NEXT:    v_subb_u32_e64 v10, s[6:7], v1, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v1, s[6:7], v1, v8
+; GISEL-NEXT:    v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v9
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v10, v5
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v7
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[8:9], v8, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v3, v5, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GISEL-NEXT:    v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11]
+; GISEL-NEXT:    v_subb_u32_e64 v1, vcc, v1, v5, s[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, v9, v14, s[4:5]
+; GISEL-NEXT:    v_subbrev_u32_e64 v14, vcc, 0, v3, s[12:13]
+; GISEL-NEXT:    v_subb_u32_e64 v3, vcc, v3, v7, s[12:13]
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v11, v15, s[8:9]
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v18, v5
+; GISEL-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v7
+; GISEL-NEXT:    v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], v18, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[8:9], v14, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v16, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v17, s[8:9]
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v5
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[8:9], 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v18, v1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v14, v3, s[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i64_pow2_shl_denom:
@@ -2401,249 +2401,249 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_urem_v2i64_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v4
-; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v6
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v3
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v1
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, 0, v1
-; GISEL-NEXT:    v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v6
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v7
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v8
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v6
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
+; GISEL-NEXT:    s_bfe_i32 s6, 1, 0x10000
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v1
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v1
+; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v4, s6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v0
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v0
+; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v5
+; GISEL-NEXT:    v_mac_f32_e32 v9, 0x4f800000, v5
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v9
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v6
-; GISEL-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v7
-; GISEL-NEXT:    v_trunc_f32_e32 v8, v8
-; GISEL-NEXT:    v_trunc_f32_e32 v11, v11
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v8
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v11
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v5
+; GISEL-NEXT:    v_mul_f32_e32 v12, 0x2f800000, v6
+; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
+; GISEL-NEXT:    v_trunc_f32_e32 v12, v12
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v9
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v12
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v12
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v9
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v8
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v11
-; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v15, v5, v6
-; GISEL-NEXT:    v_mul_hi_u32 v16, v4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v7
-; GISEL-NEXT:    v_mul_lo_u32 v18, v10, v7
-; GISEL-NEXT:    v_mul_hi_u32 v19, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
-; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v17
-; GISEL-NEXT:    v_mul_hi_u32 v18, v7, v17
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v19
-; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v13
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v19
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
-; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v14
-; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v14
-; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v14
-; GISEL-NEXT:    v_mul_hi_u32 v17, v11, v17
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v16
-; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v12
-; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
-; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v16, s[4:5], v16, v18
-; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v12
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
-; GISEL-NEXT:    v_mul_lo_u32 v19, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
-; GISEL-NEXT:    v_mul_hi_u32 v19, v7, v13
+; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v12
+; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v5
+; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v5
+; GISEL-NEXT:    v_mul_hi_u32 v17, v7, v5
+; GISEL-NEXT:    v_mul_lo_u32 v18, v10, v6
+; GISEL-NEXT:    v_mul_lo_u32 v19, v11, v6
+; GISEL-NEXT:    v_mul_hi_u32 v20, v10, v6
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT:    v_mul_lo_u32 v16, v9, v15
+; GISEL-NEXT:    v_mul_hi_u32 v21, v5, v15
+; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v15
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v19, v14
+; GISEL-NEXT:    v_mul_lo_u32 v19, v12, v18
+; GISEL-NEXT:    v_mul_hi_u32 v22, v6, v18
+; GISEL-NEXT:    v_mul_hi_u32 v18, v12, v18
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v20
+; GISEL-NEXT:    v_mul_lo_u32 v17, v5, v13
+; GISEL-NEXT:    v_mul_lo_u32 v20, v9, v13
+; GISEL-NEXT:    v_mul_hi_u32 v23, v5, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT:    v_mul_lo_u32 v24, v6, v14
+; GISEL-NEXT:    v_mul_lo_u32 v25, v12, v14
+; GISEL-NEXT:    v_mul_hi_u32 v26, v6, v14
+; GISEL-NEXT:    v_mul_hi_u32 v14, v12, v14
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v20, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GISEL-NEXT:    s_bfe_i32 s4, 1, 0x10000
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v19, v24
+; GISEL-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v25, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v21
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v23
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v19, v22
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v18, v26
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v20, v21
+; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v24, v19
+; GISEL-NEXT:    v_add_i32_e32 v20, vcc, v25, v22
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT:    v_mov_b32_e32 v16, s4
-; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
-; GISEL-NEXT:    v_mul_hi_u32 v13, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v6
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v6
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
-; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, v11, v13, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v7
-; GISEL-NEXT:    v_mul_lo_u32 v10, v10, v7
-; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v7
-; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v8
-; GISEL-NEXT:    v_mul_lo_u32 v17, v8, v12
-; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
-; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v11
-; GISEL-NEXT:    v_mul_lo_u32 v19, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_mul_hi_u32 v5, v7, v13
-; GISEL-NEXT:    v_mul_hi_u32 v13, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
-; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v4
-; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v4
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v8, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v18, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v20, v19
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v5
+; GISEL-NEXT:    v_mul_hi_u32 v15, v7, v5
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v18
+; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v12, v14, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v6
+; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v6
+; GISEL-NEXT:    v_mul_hi_u32 v16, v10, v6
+; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v9
+; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v13
+; GISEL-NEXT:    v_mul_hi_u32 v18, v5, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT:    v_mul_lo_u32 v10, v10, v12
+; GISEL-NEXT:    v_mul_lo_u32 v19, v12, v14
+; GISEL-NEXT:    v_mul_hi_u32 v20, v6, v14
+; GISEL-NEXT:    v_mul_hi_u32 v14, v12, v14
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v16
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v7
+; GISEL-NEXT:    v_mul_hi_u32 v15, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v9, v7
+; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v21, v12, v8
+; GISEL-NEXT:    v_mul_hi_u32 v22, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v12, v8
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v17, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v18
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v9
-; GISEL-NEXT:    v_mul_lo_u32 v18, v11, v9
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v19, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v9
-; GISEL-NEXT:    v_add_i32_e64 v10, s[6:7], v14, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v18, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v19, v18
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v19, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v21, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v22
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v17, v10
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, 0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, 0, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v19, v16
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v21, v18
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v15
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v9, 0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v5
+; GISEL-NEXT:    v_mul_hi_u32 v10, v3, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v5, 0, v5
-; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v13, 0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, 0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v15, v2, v7
-; GISEL-NEXT:    v_mul_lo_u32 v17, 0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v18, v2, v7
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v12, v8, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, 0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v12, v2, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, 0, v6
+; GISEL-NEXT:    v_mul_lo_u32 v13, v3, v7
+; GISEL-NEXT:    v_mul_lo_u32 v14, 0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v15, v3, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v7, 0, v7
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
+; GISEL-NEXT:    v_mul_lo_u32 v16, v2, v8
+; GISEL-NEXT:    v_mul_lo_u32 v17, 0, v8
+; GISEL-NEXT:    v_mul_hi_u32 v18, v2, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, 0, v8
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v17, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v14, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v17, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v18
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v17, v11
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v16, v11
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, 0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v5
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v13, 0, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v6
+; GISEL-NEXT:    v_mul_lo_u32 v14, 0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v0, v6
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v7, v1, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v13, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], 0, v4, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v4, s[4:5], 0, v4
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v11
-; GISEL-NEXT:    v_subb_u32_e64 v8, s[6:7], 0, v5, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[6:7], 0, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v2, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, v16, v7, s[6:7]
-; GISEL-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
-; GISEL-NEXT:    v_subbrev_u32_e64 v5, vcc, 0, v5, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v0, v3
-; GISEL-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, v2, v1
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v10
+; GISEL-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], 0, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v12
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[6:7], 0, v6, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v6, s[6:7], 0, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v2, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[6:7]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, v4, v8, s[6:7]
+; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v4, v10, vcc
+; GISEL-NEXT:    v_subbrev_u32_e64 v6, vcc, 0, v6, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v3, v1
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v12, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v10, v3
-; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v2, v0
+; GISEL-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v4, v12, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v11, v1
 ; GISEL-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, v12, v1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v5, v15, s[4:5]
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v13, v0, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v15, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v6, v14, s[4:5]
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i64_24bit:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 53a9c987d8fdb..f142019e2964c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -1358,29 +1358,29 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v16
 ; GFX6-NEXT:    v_min_u32_e32 v16, v3, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v16
+; GFX6-NEXT:    v_min_u32_e32 v16, v4, v20
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v16
+; GFX6-NEXT:    v_min_u32_e32 v16, v5, v21
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v16
+; GFX6-NEXT:    v_min_u32_e32 v16, v6, v22
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v16
+; GFX6-NEXT:    v_min_u32_e32 v16, v7, v23
+; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v16
+; GFX6-NEXT:    v_min_u32_e32 v16, v8, v24
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v16
+; GFX6-NEXT:    v_min_u32_e32 v16, v9, v25
+; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v16
+; GFX6-NEXT:    v_min_u32_e32 v16, v10, v26
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v16
 ; GFX6-NEXT:    buffer_load_dword v16, off, s[0:3], s32
-; GFX6-NEXT:    v_min_u32_e32 v17, v4, v20
-; GFX6-NEXT:    v_min_u32_e32 v18, v5, v21
-; GFX6-NEXT:    v_min_u32_e32 v19, v6, v22
-; GFX6-NEXT:    v_min_u32_e32 v20, v7, v23
-; GFX6-NEXT:    v_min_u32_e32 v21, v8, v24
-; GFX6-NEXT:    v_min_u32_e32 v22, v9, v25
-; GFX6-NEXT:    v_min_u32_e32 v23, v10, v26
-; GFX6-NEXT:    v_min_u32_e32 v24, v11, v27
-; GFX6-NEXT:    v_min_u32_e32 v25, v12, v28
-; GFX6-NEXT:    v_min_u32_e32 v26, v13, v29
-; GFX6-NEXT:    v_min_u32_e32 v27, v14, v30
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v17
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v18
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v19
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v20
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v21
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v22
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v23
-; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, v11, v24
-; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v25
-; GFX6-NEXT:    v_sub_i32_e32 v13, vcc, v13, v26
-; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v27
+; GFX6-NEXT:    v_min_u32_e32 v17, v11, v27
+; GFX6-NEXT:    v_min_u32_e32 v18, v12, v28
+; GFX6-NEXT:    v_min_u32_e32 v19, v13, v29
+; GFX6-NEXT:    v_min_u32_e32 v20, v14, v30
+; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, v11, v17
+; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v18
+; GFX6-NEXT:    v_sub_i32_e32 v13, vcc, v13, v19
+; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v20
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_min_u32_e32 v16, v15, v16
 ; GFX6-NEXT:    v_sub_i32_e32 v15, vcc, v15, v16

diff  --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
index 3e19ee5567929..a6d8c6f41eee5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE
 
 ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
-; TRAP-HANDLER-ENABLE:  NumSgprs: 61
-; TRAP-HANDLER-DISABLE: NumSgprs: 77
+; TRAP-HANDLER-ENABLE:  NumSgprs: 77
+; TRAP-HANDLER-DISABLE: NumSgprs: 92
 define amdgpu_kernel void @amdhsa_trap_num_sgprs(
     ptr addrspace(1) %out0, i32 %in0,
     ptr addrspace(1) %out1, i32 %in1,

diff  --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 3f711da775039..54e8dc2a9d7cd 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2426,47 +2426,47 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    v_add_i32_e32 v18, vcc, 30, v16
 ; GCN-NEXT:    v_add_i32_e32 v19, vcc, 28, v16
 ; GCN-NEXT:    v_add_i32_e32 v20, vcc, 26, v16
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, 24, v16
+; GCN-NEXT:    v_add_i32_e32 v22, vcc, 22, v16
+; GCN-NEXT:    v_add_i32_e32 v23, vcc, 20, v16
+; GCN-NEXT:    v_add_i32_e32 v24, vcc, 18, v16
+; GCN-NEXT:    v_add_i32_e32 v25, vcc, 16, v16
+; GCN-NEXT:    v_add_i32_e32 v26, vcc, 14, v16
+; GCN-NEXT:    v_add_i32_e32 v27, vcc, 12, v16
+; GCN-NEXT:    v_add_i32_e32 v28, vcc, 10, v16
 ; GCN-NEXT:    buffer_store_short v15, v18, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v15, vcc, 24, v16
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, 22, v16
+; GCN-NEXT:    v_add_i32_e32 v15, vcc, 8, v16
+; GCN-NEXT:    v_add_i32_e32 v18, vcc, 6, v16
 ; GCN-NEXT:    buffer_store_short v14, v19, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v14, vcc, 20, v16
-; GCN-NEXT:    v_add_i32_e32 v19, vcc, 18, v16
+; GCN-NEXT:    v_add_i32_e32 v14, vcc, 4, v16
+; GCN-NEXT:    v_add_i32_e32 v19, vcc, 2, v16
 ; GCN-NEXT:    buffer_store_short v13, v20, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v13, vcc, 16, v16
-; GCN-NEXT:    v_add_i32_e32 v20, vcc, 14, v16
-; GCN-NEXT:    buffer_store_short v12, v15, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 12, v16
-; GCN-NEXT:    v_add_i32_e32 v15, vcc, 10, v16
-; GCN-NEXT:    buffer_store_short v11, v18, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, 8, v16
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, 6, v16
-; GCN-NEXT:    buffer_store_short v10, v14, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, 4, v16
-; GCN-NEXT:    v_add_i32_e32 v14, vcc, 2, v16
-; GCN-NEXT:    buffer_store_short v9, v19, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v8, v13, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v12, v21, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v11, v22, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v10, v23, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v9, v24, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v7, v20, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v8, v25, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v6, v12, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v7, v26, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v5, v15, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v6, v27, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v4, v11, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v5, v28, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v4, v15, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v3, v18, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v2, v10, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v2, v14, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v1, v14, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v1, v19, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v0, v16, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -2877,10 +2877,10 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0x50, v0
 ; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x4c, v0
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    buffer_store_dword v26, v29, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0x48, v0
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0x44, v0
 ; GCN-NEXT:    buffer_store_dword v25, v31, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
@@ -3098,20 +3098,20 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GFX9-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
 ; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
 ; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s32
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
 ; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
 ; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
 ; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
 ; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
 ; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
 ; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
@@ -3128,11 +3128,11 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
 ; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(25)
-; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT:    s_waitcnt vmcnt(25)
-; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT:    s_waitcnt vmcnt(18)
+; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT:    s_waitcnt vmcnt(18)
+; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:116
 ; GFX9-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index 2635edcb9d8a3..7549805121f06 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -4,13 +4,13 @@
 define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 {
 ; CHECK-LABEL: spill:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_load_dword s27, s[4:5], 0x2
+; CHECK-NEXT:    s_load_dword s44, s[4:5], 0x2
 ; CHECK-NEXT:    s_mov_b64 s[98:99], s[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[96:97], s[0:1]
 ; CHECK-NEXT:    s_add_u32 s96, s96, s7
 ; CHECK-NEXT:    s_addc_u32 s97, s97, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_cmp_eq_u32 s27, 0
+; CHECK-NEXT:    s_cmp_eq_u32 s44, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    ;;#ASMEND
@@ -973,10 +973,10 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    v_writelane_b32 v1, s99, 4
 ; CHECK-NEXT:    v_writelane_b32 v0, s93, 62
 ; CHECK-NEXT:    v_writelane_b32 v1, s100, 5
-; CHECK-NEXT:    s_mov_b32 s31, s12
+; CHECK-NEXT:    s_mov_b32 s49, s12
 ; CHECK-NEXT:    v_writelane_b32 v0, s94, 63
 ; CHECK-NEXT:    v_writelane_b32 v1, s101, 6
-; CHECK-NEXT:    s_cmp_eq_u32 s31, 0
+; CHECK-NEXT:    s_cmp_eq_u32 s49, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    ;;#ASMEND

diff  --git a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir
index 1f7be4b691f41..d284813c36843 100644
--- a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir
+++ b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir
@@ -49,29 +49,29 @@ body:             |
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; CHECK-NEXT:   undef %11.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef %17.sub0:vreg_64, %18:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF3]].sub0, [[DEF5]].sub0, 0, implicit $exec
-  ; CHECK-NEXT:   dead undef %17.sub1:vreg_64, dead %19:sreg_64_xexec = V_ADDC_U32_e64 [[DEF3]].sub1, [[DEF5]].sub1, %18, 0, implicit $exec
-  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
-  ; CHECK-NEXT:   dead [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub1
+  ; CHECK-NEXT:   undef %17.sub0:vreg_64, %18:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF4]].sub0, [[DEF6]].sub0, 0, implicit $exec
+  ; CHECK-NEXT:   dead undef %17.sub1:vreg_64, dead %19:sreg_64_xexec = V_ADDC_U32_e64 [[DEF4]].sub1, [[DEF6]].sub1, %18, 0, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+  ; CHECK-NEXT:   dead [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF3]]
+  ; CHECK-NEXT:   dead [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub1
   ; CHECK-NEXT:   dead [[COPY6:%[0-9]+]]:vgpr_32 = COPY %11.sub0
-  ; CHECK-NEXT:   dead [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF6]], implicit $exec
-  ; CHECK-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF7]], 288, 0, implicit $exec :: (store (s64), addrspace 1)
+  ; CHECK-NEXT:   dead [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF7]], implicit $exec
+  ; CHECK-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF8]], 288, 0, implicit $exec :: (store (s64), addrspace 1)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -81,7 +81,7 @@ body:             |
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[DEF4]].sub1:vreg_64 = COPY [[COPY5]]
+  ; CHECK-NEXT:   undef [[DEF5]].sub1:vreg_64 = COPY [[COPY5]]
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:

diff  --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
index 5ae3473f44374..66276c756db42 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
+++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
@@ -24,7 +24,7 @@ body:             |
   ; CHECK: bb.0:
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
@@ -32,9 +32,10 @@ body:             |
   ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF]], implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF1]], implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
@@ -50,34 +51,33 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]]
-  ; CHECK-NEXT:   [[V_MUL_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MUL_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[V_MUL_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec
-  ; CHECK-NEXT:   dead [[V_MUL_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF12]], implicit $mode, implicit $exec
-  ; CHECK-NEXT:   dead [[V_MAC_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[V_ADD_F32_e32_]], [[COPY]], [[V_MAC_F32_e32_]], implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec
+  ; CHECK-NEXT:   dead %23:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF13]], implicit $mode, implicit $exec
+  ; CHECK-NEXT:   dead [[V_MOV_B32_e32_1]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[V_ADD_F32_e32_]], [[COPY]], [[V_MOV_B32_e32_1]], implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; CHECK-NEXT:   $sgpr4 = IMPLICIT_DEF
-  ; CHECK-NEXT:   $vgpr0 = COPY [[DEF10]]
+  ; CHECK-NEXT:   $vgpr0 = COPY [[DEF11]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_]]
-  ; CHECK-NEXT:   $vgpr1 = COPY [[DEF6]]
+  ; CHECK-NEXT:   $vgpr1 = COPY [[DEF7]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[V_MUL_F32_e32_1]]
   ; CHECK-NEXT:   $vgpr1 = COPY [[V_MUL_F32_e32_2]]
   ; CHECK-NEXT:   $vgpr2 = COPY [[V_MUL_F32_e32_3]]
-  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL [[DEF13]], @foo, csr_amdgpu, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0
-  ; CHECK-NEXT:   [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF7]], implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MAC_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF11]], [[DEF8]], [[V_MAC_F32_e32_1]], implicit $mode, implicit $exec
-  ; CHECK-NEXT:   dead [[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_1]], 0, [[DEF3]], 0, [[DEF]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   dead [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_1]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   dead [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_1]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   GLOBAL_STORE_DWORD [[DEF14]], [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL [[DEF14]], @foo, csr_amdgpu, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0
+  ; CHECK-NEXT:   [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF8]], implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MAC_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF12]], [[DEF9]], [[V_MAC_F32_e32_]], implicit $mode, implicit $exec
+  ; CHECK-NEXT:   dead %26:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   dead %27:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   dead %28:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_]], 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.1

diff  --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
index 3f50130cce480..50842668dc05f 100644
--- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
+++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
@@ -11,24 +11,25 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
 ; CHECK-NEXT:    s_addc_u32 s25, s25, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_bitcmp1_b32 s0, 0
-; CHECK-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; CHECK-NEXT:    s_bitcmp1_b32 s0, 8
-; CHECK-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; CHECK-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; CHECK-NEXT:    s_bitcmp1_b32 s0, 16
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
 ; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; CHECK-NEXT:    s_bitcmp1_b32 s0, 24
-; CHECK-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
-; CHECK-NEXT:    s_xor_b64 s[2:3], s[6:7], -1
+; CHECK-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; CHECK-NEXT:    s_xor_b64 s[4:5], s[8:9], -1
 ; CHECK-NEXT:    s_bitcmp1_b32 s1, 0
-; CHECK-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; CHECK-NEXT:    s_bitcmp1_b32 s1, 8
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[14:15]
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
 ; CHECK-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; CHECK-NEXT:    s_bitcmp1_b32 s1, 8
+; CHECK-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, v0
+; CHECK-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
+; CHECK-NEXT:    s_and_b64 s[6:7], exec, s[10:11]
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v1
-; CHECK-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
-; CHECK-NEXT:    s_and_b64 s[4:5], exec, s[8:9]
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_branch .LBB0_3
 ; CHECK-NEXT:  .LBB0_1: ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[18:19], 0
@@ -41,17 +42,17 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_12
 ; CHECK-NEXT:  .LBB0_3: ; %bb7
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_1
 ; CHECK-NEXT:  ; %bb.4: ; %bb8
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    s_mov_b64 vcc, s[2:3]
+; CHECK-NEXT:    s_mov_b64 vcc, s[4:5]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_6
 ; CHECK-NEXT:  ; %bb.5: ; %bb9
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[16:17], 0
 ; CHECK-NEXT:    s_mov_b64 s[18:19], -1
-; CHECK-NEXT:    s_mov_b64 s[22:23], s[8:9]
+; CHECK-NEXT:    s_mov_b64 s[22:23], s[10:11]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_7
 ; CHECK-NEXT:    s_branch .LBB0_8
 ; CHECK-NEXT:  .LBB0_6: ; in Loop: Header=BB0_3 Depth=1
@@ -62,7 +63,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[18:19], -1
 ; CHECK-NEXT:    s_mov_b64 s[16:17], 0
-; CHECK-NEXT:    s_mov_b64 s[22:23], s[12:13]
+; CHECK-NEXT:    s_mov_b64 s[22:23], s[14:15]
 ; CHECK-NEXT:  .LBB0_8: ; %Flow9
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[20:21], -1
@@ -71,12 +72,12 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
 ; CHECK-NEXT:  ; %bb.9: ; %bb13
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    s_mov_b64 vcc, s[4:5]
+; CHECK-NEXT:    s_mov_b64 vcc, s[6:7]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_11
 ; CHECK-NEXT:  ; %bb.10: ; %bb16
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[16:17], 0
-; CHECK-NEXT:    s_mov_b64 s[22:23], s[10:11]
+; CHECK-NEXT:    s_mov_b64 s[22:23], s[12:13]
 ; CHECK-NEXT:    s_mov_b64 s[18:19], s[16:17]
 ; CHECK-NEXT:    s_branch .LBB0_2
 ; CHECK-NEXT:  .LBB0_11: ; in Loop: Header=BB0_3 Depth=1
@@ -86,18 +87,18 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
 ; CHECK-NEXT:    s_branch .LBB0_2
 ; CHECK-NEXT:  .LBB0_12: ; %loop.exit.guard6
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    s_xor_b64 s[14:15], s[20:21], -1
+; CHECK-NEXT:    s_xor_b64 s[22:23], s[20:21], -1
 ; CHECK-NEXT:    s_mov_b64 s[20:21], -1
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[14:15]
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[22:23]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_16
 ; CHECK-NEXT:  ; %bb.13: ; %bb14
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_15
 ; CHECK-NEXT:  ; %bb.14: ; %bb15
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    buffer_store_dword v1, off, s[24:27], 0 offset:4
-; CHECK-NEXT:    buffer_store_dword v1, off, s[24:27], 0
+; CHECK-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:4
+; CHECK-NEXT:    buffer_store_dword v0, off, s[24:27], 0
 ; CHECK-NEXT:  .LBB0_15: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[20:21], 0
@@ -112,10 +113,10 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[18:19]
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_23
 ; CHECK-NEXT:  ; %bb.19: ; %bb17
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_21
 ; CHECK-NEXT:  ; %bb.20: ; %bb19
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_22
 ; CHECK-NEXT:  .LBB0_21: ; %bb18
 ; CHECK-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 4f3f8c18f5018..f682d352c0ce8 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -888,57 +888,57 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s4, s2, 16
-; CI-NEXT:    v_mov_b32_e32 v5, s3
 ; CI-NEXT:    s_addc_u32 s5, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    v_mov_b32_e32 v4, s2
+; CI-NEXT:    v_mov_b32_e32 v5, s3
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
+; CI-NEXT:    v_mov_b32_e32 v4, s2
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; CI-NEXT:    s_add_u32 s2, s0, 16
 ; CI-NEXT:    s_addc_u32 s3, s1, 0
-; CI-NEXT:    v_mov_b32_e32 v14, s3
-; CI-NEXT:    v_mov_b32_e32 v13, s2
-; CI-NEXT:    s_add_u32 s2, s0, 48
-; CI-NEXT:    s_addc_u32 s3, s1, 0
 ; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f32_f16_e32 v8, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v14, v3
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e32 v11, v7
-; CI-NEXT:    v_cvt_f32_f16_e32 v9, v6
-; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; CI-NEXT:    v_cvt_f32_f16_e32 v12, v7
-; CI-NEXT:    v_cvt_f32_f16_e32 v10, v6
-; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; CI-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
-; CI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
-; CI-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
-; CI-NEXT:    v_cvt_f32_f16_e32 v6, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v12, v3
+; CI-NEXT:    v_cvt_f32_f16_e32 v18, v7
+; CI-NEXT:    v_cvt_f32_f16_e32 v16, v6
+; CI-NEXT:    v_lshrrev_b32_e32 v19, 16, v7
+; CI-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
+; CI-NEXT:    v_mov_b32_e32 v7, s3
 ; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v10, v2
-; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; CI-NEXT:    v_lshrrev_b32_e32 v17, 16, v5
+; CI-NEXT:    v_mov_b32_e32 v6, s2
+; CI-NEXT:    s_add_u32 s2, s0, 48
+; CI-NEXT:    v_cvt_f32_f16_e32 v10, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v8, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v12, v2
+; CI-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v4
+; CI-NEXT:    v_lshrrev_b32_e32 v24, 16, v4
 ; CI-NEXT:    v_mov_b32_e32 v5, s1
-; CI-NEXT:    v_cvt_f32_f16_e32 v9, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v13, v3
-; CI-NEXT:    v_cvt_f32_f16_e32 v3, v16
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v17
+; CI-NEXT:    s_addc_u32 s3, s1, 0
+; CI-NEXT:    v_cvt_f32_f16_e32 v15, v3
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v17
+; CI-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT:    v_cvt_f32_f16_e32 v17, v25
 ; CI-NEXT:    v_mov_b32_e32 v4, s0
+; CI-NEXT:    v_cvt_f32_f16_e32 v11, v1
 ; CI-NEXT:    s_add_u32 s0, s0, 32
-; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v24
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
-; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT:    v_mov_b32_e32 v15, s3
-; CI-NEXT:    v_mov_b32_e32 v17, s1
-; CI-NEXT:    v_mov_b32_e32 v14, s2
-; CI-NEXT:    v_mov_b32_e32 v16, s0
+; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT:    v_mov_b32_e32 v21, s3
+; CI-NEXT:    v_mov_b32_e32 v23, s1
+; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT:    v_mov_b32_e32 v20, s2
+; CI-NEXT:    v_mov_b32_e32 v22, s0
+; CI-NEXT:    flat_store_dwordx4 v[6:7], v[16:19]
 ; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; CI-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; CI-NEXT:    flat_store_dwordx4 v[16:17], v[6:9]
+; CI-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
+; CI-NEXT:    flat_store_dwordx4 v[22:23], v[8:11]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: global_extload_v16f16_to_v16f32:
@@ -947,24 +947,26 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    s_add_u32 s2, s2, 16
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s3
 ; VI-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; VI-NEXT:    s_add_u32 s2, s0, 16
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v19, s3
-; VI-NEXT:    v_mov_b32_e32 v18, s2
+; VI-NEXT:    v_mov_b32_e32 v23, s3
+; VI-NEXT:    v_mov_b32_e32 v22, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 48
-; VI-NEXT:    v_mov_b32_e32 v17, s1
+; VI-NEXT:    v_mov_b32_e32 v21, s1
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v16, s0
+; VI-NEXT:    v_mov_b32_e32 v20, s0
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v21, s3
-; VI-NEXT:    v_mov_b32_e32 v20, s2
+; VI-NEXT:    v_mov_b32_e32 v25, s3
+; VI-NEXT:    v_mov_b32_e32 v27, s1
+; VI-NEXT:    v_mov_b32_e32 v24, s2
+; VI-NEXT:    v_mov_b32_e32 v26, s0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v14, v3
 ; VI-NEXT:    v_cvt_f32_f16_e32 v12, v2
@@ -974,21 +976,19 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
 ; VI-NEXT:    v_cvt_f32_f16_e32 v8, v0
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e32 v18, v7
+; VI-NEXT:    v_cvt_f32_f16_e32 v16, v6
+; VI-NEXT:    v_cvt_f32_f16_sdwa v19, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_sdwa v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v5
-; VI-NEXT:    v_cvt_f32_f16_e32 v14, v7
-; VI-NEXT:    v_cvt_f32_f16_e32 v12, v6
-; VI-NEXT:    v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; VI-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
-; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT:    flat_store_dwordx4 v[22:23], v[12:15]
+; VI-NEXT:    flat_store_dwordx4 v[20:21], v[8:11]
+; VI-NEXT:    flat_store_dwordx4 v[24:25], v[16:19]
+; VI-NEXT:    flat_store_dwordx4 v[26:27], v[0:3]
 ; VI-NEXT:    s_endpgm
   %val = load <16 x half>, ptr addrspace(1) %in
   %cvt = fpext <16 x half> %val to <16 x float>
@@ -1183,43 +1183,43 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    s_add_u32 s2, s0, 48
 ; CI-NEXT:    s_addc_u32 s3, s1, 0
-; CI-NEXT:    v_mov_b32_e32 v7, s3
-; CI-NEXT:    v_mov_b32_e32 v6, s2
+; CI-NEXT:    v_mov_b32_e32 v19, s3
+; CI-NEXT:    v_mov_b32_e32 v18, s2
 ; CI-NEXT:    s_add_u32 s2, s0, 32
-; CI-NEXT:    v_mov_b32_e32 v13, s1
+; CI-NEXT:    v_mov_b32_e32 v17, s1
 ; CI-NEXT:    s_addc_u32 s3, s1, 0
-; CI-NEXT:    v_mov_b32_e32 v12, s0
+; CI-NEXT:    v_mov_b32_e32 v16, s0
 ; CI-NEXT:    s_add_u32 s0, s0, 16
-; CI-NEXT:    v_mov_b32_e32 v15, s3
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
-; CI-NEXT:    v_mov_b32_e32 v14, s2
+; CI-NEXT:    v_mov_b32_e32 v21, s3
+; CI-NEXT:    v_mov_b32_e32 v23, s1
+; CI-NEXT:    v_mov_b32_e32 v20, s2
+; CI-NEXT:    v_mov_b32_e32 v22, s0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; CI-NEXT:    v_cvt_f32_f16_e32 v8, v2
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, v4
-; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v10, v1
-; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v4, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v16, v5
-; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
-; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; CI-NEXT:    v_cvt_f32_f16_e32 v17, v9
-; CI-NEXT:    v_cvt_f32_f16_e32 v18, v11
-; CI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; CI-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
-; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v10
-; CI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v16
-; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
-; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v18
-; CI-NEXT:    v_mov_b32_e32 v17, s1
-; CI-NEXT:    v_mov_b32_e32 v16, s0
-; CI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
+; CI-NEXT:    v_cvt_f32_f16_e32 v10, v4
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v11, v5
+; CI-NEXT:    v_cvt_f32_f16_e32 v7, v1
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v24, v1
+; CI-NEXT:    v_cvt_f64_f32_e32 v[12:13], v3
+; CI-NEXT:    v_cvt_f64_f32_e32 v[14:15], v10
+; CI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
+; CI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v11
+; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
+; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v24
+; CI-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
+; CI-NEXT:    flat_store_dwordx4 v[20:21], v[8:11]
+; CI-NEXT:    flat_store_dwordx4 v[22:23], v[4:7]
 ; CI-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
-; CI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: global_extload_v8f16_to_v8f64:
@@ -1231,39 +1231,39 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    s_add_u32 s2, s0, 48
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v8, s3
-; VI-NEXT:    v_mov_b32_e32 v7, s2
+; VI-NEXT:    v_mov_b32_e32 v19, s3
+; VI-NEXT:    v_mov_b32_e32 v18, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 32
-; VI-NEXT:    v_mov_b32_e32 v13, s1
+; VI-NEXT:    v_mov_b32_e32 v17, s1
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v12, s0
+; VI-NEXT:    v_mov_b32_e32 v16, s0
 ; VI-NEXT:    s_add_u32 s0, s0, 16
-; VI-NEXT:    v_mov_b32_e32 v15, s3
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v14, s2
+; VI-NEXT:    v_mov_b32_e32 v21, s3
+; VI-NEXT:    v_mov_b32_e32 v23, s1
+; VI-NEXT:    v_mov_b32_e32 v20, s2
+; VI-NEXT:    v_mov_b32_e32 v22, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e32 v9, v0
-; VI-NEXT:    v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f32_f16_e32 v0, v3
-; VI-NEXT:    v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f32_f16_e32 v10, v1
-; VI-NEXT:    v_cvt_f32_f16_e32 v11, v2
-; VI-NEXT:    v_cvt_f64_f32_e32 v[3:4], v0
-; VI-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
+; VI-NEXT:    v_cvt_f32_f16_e32 v10, v3
+; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
-; VI-NEXT:    flat_store_dwordx4 v[7:8], v[3:6]
-; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v11
-; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
+; VI-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; VI-NEXT:    v_cvt_f32_f16_e32 v5, v1
+; VI-NEXT:    v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_sdwa v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f64_f32_e32 v[12:13], v10
+; VI-NEXT:    v_cvt_f64_f32_e32 v[14:15], v3
+; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v7
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
-; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v17
-; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v16
-; VI-NEXT:    v_mov_b32_e32 v17, s1
-; VI-NEXT:    v_mov_b32_e32 v16, s0
-; VI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
-; VI-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
-; VI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v24
+; VI-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
+; VI-NEXT:    flat_store_dwordx4 v[20:21], v[8:11]
+; VI-NEXT:    flat_store_dwordx4 v[22:23], v[4:7]
+; VI-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
 ; VI-NEXT:    s_endpgm
   %val = load <8 x half>, ptr addrspace(1) %in
   %cvt = fpext <8 x half> %val to <8 x double>
@@ -1304,28 +1304,26 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
 ; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cvt_f32_f16_e32 v10, v8
-; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
 ; CI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v3
 ; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; CI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT:    v_cvt_f32_f16_e32 v21, v5
 ; CI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
-; CI-NEXT:    v_mov_b32_e32 v15, s3
+; CI-NEXT:    s_nop 0
 ; CI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
 ; CI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v3
 ; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; CI-NEXT:    v_mov_b32_e32 v14, s2
+; CI-NEXT:    v_mov_b32_e32 v15, s3
 ; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v9, v0
 ; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
 ; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT:    s_waitcnt vmcnt(2)
 ; CI-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
 ; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; CI-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
@@ -1333,16 +1331,19 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
 ; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
 ; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
 ; CI-NEXT:    v_cvt_f32_f16_e32 v8, v10
+; CI-NEXT:    v_mov_b32_e32 v14, s2
 ; CI-NEXT:    s_add_u32 s2, s0, 0x60
 ; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
 ; CI-NEXT:    v_cvt_f32_f16_e32 v10, v11
 ; CI-NEXT:    s_addc_u32 s3, s1, 0
+; CI-NEXT:    v_cvt_f32_f16_e32 v19, v5
 ; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; CI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
 ; CI-NEXT:    v_mov_b32_e32 v17, s3
 ; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v7
 ; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
-; CI-NEXT:    v_cvt_f32_f16_e32 v7, v20
+; CI-NEXT:    v_cvt_f32_f16_e32 v7, v18
 ; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; CI-NEXT:    v_cvt_f32_f16_e32 v12, v5
 ; CI-NEXT:    v_mov_b32_e32 v16, s2
@@ -1353,16 +1354,16 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
 ; CI-NEXT:    s_add_u32 s0, s0, 64
 ; CI-NEXT:    flat_store_dwordx4 v[14:15], v[0:3]
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
-; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v21
+; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v19
 ; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v7
 ; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
 ; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v12
-; CI-NEXT:    v_mov_b32_e32 v19, s3
+; CI-NEXT:    v_mov_b32_e32 v21, s3
 ; CI-NEXT:    v_mov_b32_e32 v13, s1
-; CI-NEXT:    v_mov_b32_e32 v18, s2
+; CI-NEXT:    v_mov_b32_e32 v20, s2
 ; CI-NEXT:    v_mov_b32_e32 v12, s0
 ; CI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; CI-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
+; CI-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
 ; CI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
 ; CI-NEXT:    s_endpgm
 ;
@@ -1372,85 +1373,84 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    s_add_u32 s2, s2, 16
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v5, s3
+; VI-NEXT:    v_mov_b32_e32 v4, s2
+; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; VI-NEXT:    s_add_u32 s2, s0, 48
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v14, s3
-; VI-NEXT:    v_mov_b32_e32 v13, s2
+; VI-NEXT:    v_mov_b32_e32 v9, s3
+; VI-NEXT:    v_mov_b32_e32 v8, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 32
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v16, s3
-; VI-NEXT:    v_mov_b32_e32 v15, s2
+; VI-NEXT:    v_mov_b32_e32 v13, s3
+; VI-NEXT:    v_mov_b32_e32 v12, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 16
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v18, s3
-; VI-NEXT:    v_mov_b32_e32 v17, s2
+; VI-NEXT:    v_mov_b32_e32 v15, s3
+; VI-NEXT:    v_mov_b32_e32 v14, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 0x50
-; VI-NEXT:    v_mov_b32_e32 v12, s1
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v11, s0
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_cvt_f32_f16_e32 v8, v7
-; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f64_f32_e32 v[7:8], v8
-; VI-NEXT:    v_cvt_f64_f32_e32 v[9:10], v9
-; VI-NEXT:    flat_store_dwordx4 v[13:14], v[7:10]
-; VI-NEXT:    s_nop 0
-; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_cvt_f32_f16_e32 v10, v2
-; VI-NEXT:    v_mov_b32_e32 v14, s3
-; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
-; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; VI-NEXT:    v_mov_b32_e32 v13, s2
+; VI-NEXT:    v_mov_b32_e32 v17, s3
+; VI-NEXT:    v_mov_b32_e32 v16, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 64
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    flat_store_dwordx4 v[15:16], v[6:9]
-; VI-NEXT:    v_mov_b32_e32 v16, s3
-; VI-NEXT:    v_cvt_f32_f16_e32 v6, v5
-; VI-NEXT:    v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f32_f16_e32 v8, v4
-; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
-; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
-; VI-NEXT:    v_mov_b32_e32 v15, s2
+; VI-NEXT:    v_mov_b32_e32 v19, s3
+; VI-NEXT:    v_mov_b32_e32 v11, s1
+; VI-NEXT:    v_mov_b32_e32 v18, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 0x70
+; VI-NEXT:    v_mov_b32_e32 v10, s0
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    flat_store_dwordx4 v[17:18], v[4:7]
-; VI-NEXT:    v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v8
-; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v9
-; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; VI-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
-; VI-NEXT:    v_cvt_f32_f16_sdwa v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f32_f16_e32 v7, v3
-; VI-NEXT:    v_cvt_f64_f32_e32 v[3:4], v9
-; VI-NEXT:    v_cvt_f32_f16_e32 v9, v0
-; VI-NEXT:    v_cvt_f64_f32_e32 v[1:2], v2
-; VI-NEXT:    v_cvt_f64_f32_e32 v[5:6], v10
-; VI-NEXT:    v_cvt_f64_f32_e32 v[11:12], v11
-; VI-NEXT:    v_cvt_f64_f32_e32 v[9:10], v9
 ; VI-NEXT:    s_add_u32 s0, s0, 0x60
-; VI-NEXT:    flat_store_dwordx4 v[13:14], v[1:4]
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
-; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v7
-; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
-; VI-NEXT:    v_cvt_f64_f32_e32 v[7:8], v8
-; VI-NEXT:    v_mov_b32_e32 v20, s3
-; VI-NEXT:    v_mov_b32_e32 v14, s1
-; VI-NEXT:    v_mov_b32_e32 v19, s2
-; VI-NEXT:    v_mov_b32_e32 v13, s0
-; VI-NEXT:    flat_store_dwordx4 v[15:16], v[9:12]
-; VI-NEXT:    flat_store_dwordx4 v[19:20], v[0:3]
-; VI-NEXT:    flat_store_dwordx4 v[13:14], v[5:8]
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_cvt_f32_f16_e32 v22, v0
+; VI-NEXT:    v_cvt_f32_f16_sdwa v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_e32 v0, v3
+; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_e32 v24, v1
+; VI-NEXT:    v_cvt_f32_f16_sdwa v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_e32 v20, v2
+; VI-NEXT:    v_cvt_f32_f16_sdwa v21, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e32 v31, v5
+; VI-NEXT:    v_cvt_f32_f16_sdwa v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_e32 v26, v6
+; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; VI-NEXT:    v_cvt_f32_f16_sdwa v27, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v20
+; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v21
+; VI-NEXT:    v_cvt_f32_f16_e32 v8, v7
+; VI-NEXT:    v_cvt_f32_f16_sdwa v28, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_e32 v29, v4
+; VI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; VI-NEXT:    v_cvt_f32_f16_sdwa v30, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v24
+; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v25
+; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v22
+; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v23
+; VI-NEXT:    v_cvt_f64_f32_e32 v[12:13], v31
+; VI-NEXT:    flat_store_dwordx4 v[14:15], v[0:3]
+; VI-NEXT:    v_cvt_f64_f32_e32 v[14:15], v32
+; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v29
+; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v30
+; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v28
+; VI-NEXT:    v_mov_b32_e32 v21, s3
+; VI-NEXT:    v_mov_b32_e32 v23, s1
+; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v26
+; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v27
+; VI-NEXT:    v_mov_b32_e32 v20, s2
+; VI-NEXT:    v_mov_b32_e32 v22, s0
+; VI-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
+; VI-NEXT:    flat_store_dwordx4 v[18:19], v[4:7]
+; VI-NEXT:    flat_store_dwordx4 v[20:21], v[8:11]
+; VI-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
 ; VI-NEXT:    s_endpgm
   %val = load <16 x half>, ptr addrspace(1) %in
   %cvt = fpext <16 x half> %val to <16 x double>
@@ -1706,51 +1706,52 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
 ; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v13, s3
-; CI-NEXT:    v_mov_b32_e32 v12, s2
 ; CI-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; CI-NEXT:    v_mov_b32_e32 v12, s2
 ; CI-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; CI-NEXT:    s_add_u32 s2, s0, 16
 ; CI-NEXT:    s_addc_u32 s3, s1, 0
+; CI-NEXT:    v_mov_b32_e32 v17, s3
+; CI-NEXT:    v_mov_b32_e32 v16, s2
 ; CI-NEXT:    s_waitcnt vmcnt(3)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    s_waitcnt vmcnt(2)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT:    v_cvt_f16_f32_e32 v16, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT:    v_cvt_f16_f32_e32 v17, v4
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; CI-NEXT:    v_mov_b32_e32 v5, s3
 ; CI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; CI-NEXT:    v_or_b32_e32 v1, v2, v3
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v16
-; CI-NEXT:    v_mov_b32_e32 v4, s2
+; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v18
 ; CI-NEXT:    v_or_b32_e32 v3, v6, v2
-; CI-NEXT:    v_or_b32_e32 v2, v17, v7
-; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
-; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
-; CI-NEXT:    v_lshlrev_b32_e32 v9, 16, v15
-; CI-NEXT:    v_lshlrev_b32_e32 v11, 16, v13
-; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; CI-NEXT:    v_or_b32_e32 v2, v4, v5
+; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v11
+; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
+; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v15
+; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v13
+; CI-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
+; CI-NEXT:    s_nop 0
+; CI-NEXT:    v_or_b32_e32 v1, v10, v4
+; CI-NEXT:    v_or_b32_e32 v0, v8, v5
 ; CI-NEXT:    v_mov_b32_e32 v5, s1
-; CI-NEXT:    v_or_b32_e32 v1, v10, v6
-; CI-NEXT:    v_or_b32_e32 v0, v8, v7
-; CI-NEXT:    v_or_b32_e32 v3, v14, v9
-; CI-NEXT:    v_or_b32_e32 v2, v12, v11
+; CI-NEXT:    v_or_b32_e32 v3, v14, v6
+; CI-NEXT:    v_or_b32_e32 v2, v12, v7
 ; CI-NEXT:    v_mov_b32_e32 v4, s0
 ; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; CI-NEXT:    s_endpgm
@@ -1766,29 +1767,31 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
 ; VI-NEXT:    s_add_u32 s4, s2, 48
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v9, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_mov_b32_e32 v8, s2
 ; VI-NEXT:    s_add_u32 s2, s2, 16
-; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v13, s3
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; VI-NEXT:    v_mov_b32_e32 v13, s3
 ; VI-NEXT:    v_mov_b32_e32 v12, s2
 ; VI-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; VI-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; VI-NEXT:    s_add_u32 s2, s0, 16
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v17, s3
+; VI-NEXT:    v_mov_b32_e32 v16, s2
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; VI-NEXT:    v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT:    v_cvt_f16_f32_sdwa v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; VI-NEXT:    v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-NEXT:    v_cvt_f16_f32_e32 v18, v4
+; VI-NEXT:    v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_cvt_f16_f32_e32 v10, v10
@@ -1799,19 +1802,17 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
 ; VI-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v3
-; VI-NEXT:    v_or_b32_e32 v0, v0, v16
+; VI-NEXT:    v_or_b32_e32 v0, v0, v18
 ; VI-NEXT:    v_or_b32_e32 v3, v6, v7
-; VI-NEXT:    v_or_b32_e32 v2, v18, v17
-; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT:    v_or_b32_e32 v2, v4, v5
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
+; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    v_or_b32_e32 v1, v10, v11
 ; VI-NEXT:    v_or_b32_e32 v0, v8, v9
 ; VI-NEXT:    v_or_b32_e32 v3, v14, v15
 ; VI-NEXT:    v_or_b32_e32 v2, v12, v13
-; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
   %val = load <16 x float>, ptr addrspace(1) %in

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 55a57ef67a3e3..9f0055143e713 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -2931,8 +2931,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v14
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
-; GFX8-NEXT:    v_mul_lo_u16_e32 v10, v10, v15
-; GFX8-NEXT:    v_mul_lo_u16_e32 v15, v16, v18
+; GFX8-NEXT:    v_mul_lo_u16_e32 v20, v16, v18
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
@@ -2940,7 +2939,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v14, v17, v19
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v8, v9, v11
-; GFX8-NEXT:    v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_mul_lo_u16_e32 v10, v10, v15
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 03e1960ca7c6a..5a002364ab23b 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -1061,585 +1061,560 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
 ; GCN-NEXT:    s_lshr_b32 s42, s7, 22
 ; GCN-NEXT:    s_lshr_b32 s43, s7, 23
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x77
-; GCN-NEXT:    v_mov_b32_e32 v14, s43
+; GCN-NEXT:    v_mov_b32_e32 v15, s43
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x76
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
-; GCN-NEXT:    v_mov_b32_e32 v17, s42
+; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
+; GCN-NEXT:    v_mov_b32_e32 v18, s42
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
+; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x75
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v17
-; GCN-NEXT:    v_mov_b32_e32 v17, s41
+; GCN-NEXT:    v_or_b32_e32 v15, v15, v18
+; GCN-NEXT:    v_mov_b32_e32 v18, s41
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x74
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_mov_b32_e32 v18, s40
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
+; GCN-NEXT:    v_mov_b32_e32 v19, s40
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
+; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
+; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x73
-; GCN-NEXT:    v_or_b32_e32 v14, v17, v14
-; GCN-NEXT:    v_mov_b32_e32 v17, s39
+; GCN-NEXT:    v_or_b32_e32 v15, v18, v15
+; GCN-NEXT:    v_mov_b32_e32 v18, s39
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x72
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_mov_b32_e32 v18, s38
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
+; GCN-NEXT:    v_mov_b32_e32 v19, s38
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x71
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
-; GCN-NEXT:    v_mov_b32_e32 v18, s37
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
+; GCN-NEXT:    v_mov_b32_e32 v19, s37
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x70
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_mov_b32_e32 v19, s36
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_mov_b32_e32 v20, s36
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
+; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
 ; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v14
-; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
+; GCN-NEXT:    v_and_b32_e32 v18, 15, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7f
-; GCN-NEXT:    v_or_b32_e32 v14, v17, v14
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 7, s35
+; GCN-NEXT:    v_or_b32_e32 v15, v18, v15
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 7, s35
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7e
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s35
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 6, s35
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7d
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s35
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 5, s35
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7c
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s35
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 4, s35
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
+; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7b
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 3, s35
+; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 3, s35
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7a
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s35
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 2, s35
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x78
-; GCN-NEXT:    v_mov_b32_e32 v12, s35
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
+; GCN-NEXT:    v_mov_b32_e32 v13, s35
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 3, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x79
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s35
-; GCN-NEXT:    v_cndmask_b32_e32 v12, 1, v12, vcc
+; GCN-NEXT:    v_or_b32_e32 v19, v19, v20
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 1, s35
+; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v12, 1, v12
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v12, v12, v19
-; GCN-NEXT:    v_and_b32_e32 v12, 3, v12
-; GCN-NEXT:    v_or_b32_e32 v18, v12, v18
-; GCN-NEXT:    v_mov_b32_e32 v12, 15
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 12, v17
-; GCN-NEXT:    v_and_b32_sdwa v18, v18, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    v_and_b32_e32 v13, 1, v13
+; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
+; GCN-NEXT:    v_or_b32_e32 v13, v13, v20
+; GCN-NEXT:    v_and_b32_e32 v13, 3, v13
+; GCN-NEXT:    v_or_b32_e32 v19, v13, v19
+; GCN-NEXT:    v_mov_b32_e32 v13, 15
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 12, v18
+; GCN-NEXT:    v_and_b32_sdwa v19, v19, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6f
-; GCN-NEXT:    v_or_b32_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 15, s7
+; GCN-NEXT:    v_or_b32_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 15, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6e
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 14, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 14, s7
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6d
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 13, s7
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 13, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6c
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 12, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 12, s7
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
+; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6b
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 11, s7
+; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 11, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6a
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 10, s7
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 10, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 3, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x69
+; GCN-NEXT:    v_or_b32_e32 v19, v19, v20
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 9, s7
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x68
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 8, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
+; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_or_b32_e32 v17, v17, v20
+; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
+; GCN-NEXT:    v_or_b32_e32 v17, v17, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 12, v18
+; GCN-NEXT:    v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x67
+; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 7, s7
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x66
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 6, s7
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
 ; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x69
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x65
 ; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 9, s7
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 5, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x68
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 8, s7
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x64
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 4, s7
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
 ; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
+; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x63
+; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 3, s7
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x62
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 2, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 3, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x61
+; GCN-NEXT:    v_or_b32_e32 v19, v19, v20
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 1, s7
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x60
+; GCN-NEXT:    v_mov_b32_e32 v16, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
 ; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v19
+; GCN-NEXT:    v_or_b32_e32 v16, v16, v20
 ; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
+; GCN-NEXT:    v_or_b32_e32 v16, v16, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 4, v18
+; GCN-NEXT:    v_and_b32_e32 v16, 15, v16
 ; GCN-NEXT:    v_or_b32_e32 v16, v16, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 12, v17
-; GCN-NEXT:    v_and_b32_sdwa v16, v16, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x67
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 7, s7
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x57
+; GCN-NEXT:    v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_mov_b32_e32 v17, s34
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x66
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s7
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x56
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    v_mov_b32_e32 v18, s33
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
 ; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
 ; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
 ; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x65
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x55
 ; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s7
+; GCN-NEXT:    v_mov_b32_e32 v18, s31
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x64
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s7
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x54
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_mov_b32_e32 v19, s30
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
 ; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
 ; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x63
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x53
 ; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 3, s7
+; GCN-NEXT:    v_mov_b32_e32 v18, s29
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x62
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s7
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x52
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_mov_b32_e32 v19, s28
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
 ; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x61
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x51
 ; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s7
+; GCN-NEXT:    v_mov_b32_e32 v19, s27
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x60
-; GCN-NEXT:    v_mov_b32_e32 v15, s7
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x50
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_mov_b32_e32 v20, s26
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
 ; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v19
-; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v18
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
+; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
+; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
 ; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
-; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v17
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x57
-; GCN-NEXT:    v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v16, s34
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x56
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_mov_b32_e32 v17, s33
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x55
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_mov_b32_e32 v17, s31
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x54
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_mov_b32_e32 v18, s30
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_and_b32_e32 v18, 15, v18
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5f
 ; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x53
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_mov_b32_e32 v17, s29
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x52
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_mov_b32_e32 v18, s28
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 7, s25
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5e
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 6, s25
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x51
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
-; GCN-NEXT:    v_mov_b32_e32 v18, s27
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x50
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_mov_b32_e32 v19, s26
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5f
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 7, s25
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5e
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s25
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5d
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s25
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 5, s25
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5c
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s25
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 4, s25
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
+; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5b
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 3, s25
+; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 3, s25
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5a
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s25
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 2, s25
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x58
 ; GCN-NEXT:    v_mov_b32_e32 v3, s25
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 3, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x59
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s25
+; GCN-NEXT:    v_or_b32_e32 v19, v19, v20
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 1, s25
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
 ; GCN-NEXT:    v_and_b32_e32 v3, 1, v3
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v20
 ; GCN-NEXT:    v_and_b32_e32 v3, 3, v3
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 12, v17
-; GCN-NEXT:    v_and_b32_sdwa v3, v3, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v3, v17, v3
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 12, v18
+; GCN-NEXT:    v_and_b32_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v3, v18, v3
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4f
-; GCN-NEXT:    v_or_b32_sdwa v16, v16, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_sdwa v17, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NEXT:    v_lshrrev_b16_e64 v3, 15, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4e
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 14, s6
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 14, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
 ; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4d
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 13, s6
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v18
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 13, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4c
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 12, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 12, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
+; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
+; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4b
-; GCN-NEXT:    v_or_b32_e32 v3, v17, v3
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 11, s6
+; GCN-NEXT:    v_or_b32_e32 v3, v18, v3
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 11, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4a
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 10, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 10, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x49
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 9, s6
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 9, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x48
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 8, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 8, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
+; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
 ; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
 ; GCN-NEXT:    v_lshlrev_b16_e32 v3, 12, v3
-; GCN-NEXT:    v_and_b32_sdwa v17, v17, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_and_b32_sdwa v18, v18, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x47
-; GCN-NEXT:    v_or_b32_e32 v17, v3, v17
+; GCN-NEXT:    v_or_b32_e32 v18, v3, v18
 ; GCN-NEXT:    v_lshrrev_b16_e64 v3, 7, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x46
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s6
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 6, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x45
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s6
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v19
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 5, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x44
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 4, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
+; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x43
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v3
+; GCN-NEXT:    v_or_b32_e32 v19, v19, v3
 ; GCN-NEXT:    v_lshrrev_b16_e64 v3, 3, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x42
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s6
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 2, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
 ; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x41
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s6
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v20
+; GCN-NEXT:    v_lshrrev_b16_e64 v20, 1, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 64
 ; GCN-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, 1, v2, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
 ; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v19
+; GCN-NEXT:    v_or_b32_e32 v2, v2, v20
 ; GCN-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
-; GCN-NEXT:    v_or_b32_sdwa v3, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v18
+; GCN-NEXT:    v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v19
 ; GCN-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 55
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v14, s24
+; GCN-NEXT:    v_or_b32_e32 v2, v2, v15
+; GCN-NEXT:    v_mov_b32_e32 v15, s24
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 54
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
-; GCN-NEXT:    v_mov_b32_e32 v15, s23
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
-; GCN-NEXT:    s_cmp_lg_u32 s0, 53
-; GCN-NEXT:    v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v15
-; GCN-NEXT:    v_mov_b32_e32 v15, s22
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 52
-; GCN-NEXT:    v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_mov_b32_e32 v16, s21
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
-; GCN-NEXT:    s_cmp_lg_u32 s0, 51
-; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
-; GCN-NEXT:    v_mov_b32_e32 v15, s20
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 50
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_mov_b32_e32 v16, s19
+; GCN-NEXT:    v_mov_b32_e32 v16, s23
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
 ; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
 ; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
 ; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
-; GCN-NEXT:    s_cmp_lg_u32 s0, 49
+; GCN-NEXT:    s_cmp_lg_u32 s0, 53
+; GCN-NEXT:    v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
-; GCN-NEXT:    v_mov_b32_e32 v16, s18
+; GCN-NEXT:    v_mov_b32_e32 v16, s22
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 48
+; GCN-NEXT:    s_cmp_lg_u32 s0, 52
+; GCN-NEXT:    v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_mov_b32_e32 v17, s17
+; GCN-NEXT:    v_mov_b32_e32 v17, s21
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
 ; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
 ; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
 ; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
 ; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
+; GCN-NEXT:    s_cmp_lg_u32 s0, 51
 ; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v14
-; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
-; GCN-NEXT:    s_cmp_lg_u32 s0, 63
-; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
-; GCN-NEXT:    v_lshrrev_b16_e64 v15, 7, s16
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 62
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 6, s16
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
+; GCN-NEXT:    v_mov_b32_e32 v16, s20
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 50
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
-; GCN-NEXT:    s_cmp_lg_u32 s0, 61
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s16
+; GCN-NEXT:    v_mov_b32_e32 v17, s19
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 60
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 4, s16
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
+; GCN-NEXT:    s_cmp_lg_u32 s0, 49
+; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
+; GCN-NEXT:    v_mov_b32_e32 v17, s18
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 48
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_mov_b32_e32 v18, s17
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
+; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
+; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
 ; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
-; GCN-NEXT:    s_cmp_lg_u32 s0, 59
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
+; GCN-NEXT:    v_and_b32_e32 v16, 15, v16
+; GCN-NEXT:    s_cmp_lg_u32 s0, 63
 ; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 3, s16
+; GCN-NEXT:    v_lshrrev_b16_e64 v16, 7, s16
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 58
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 2, s16
+; GCN-NEXT:    s_cmp_lg_u32 s0, 62
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 6, s16
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
 ; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    s_cmp_lg_u32 s0, 56
-; GCN-NEXT:    v_mov_b32_e32 v13, s16
 ; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
 ; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 57
+; GCN-NEXT:    s_cmp_lg_u32 s0, 61
 ; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 1, s16
-; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 5, s16
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 60
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 4, s16
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v13, 1, v13
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
 ; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_or_b32_e32 v13, v13, v17
-; GCN-NEXT:    v_and_b32_e32 v13, 3, v13
-; GCN-NEXT:    v_or_b32_e32 v13, v13, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
-; GCN-NEXT:    v_and_b32_sdwa v13, v13, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v13, v15, v13
-; GCN-NEXT:    s_cmp_lg_u32 s0, 47
-; GCN-NEXT:    v_or_b32_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_lshrrev_b16_e64 v13, 15, s5
+; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
+; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
+; GCN-NEXT:    s_cmp_lg_u32 s0, 59
+; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 3, s16
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 46
-; GCN-NEXT:    v_lshrrev_b16_e64 v15, 14, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
+; GCN-NEXT:    s_cmp_lg_u32 s0, 58
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 2, s16
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v13, 3, v13
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
-; GCN-NEXT:    s_cmp_lg_u32 s0, 45
-; GCN-NEXT:    v_or_b32_e32 v13, v13, v15
-; GCN-NEXT:    v_lshrrev_b16_e64 v15, 13, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    s_cmp_lg_u32 s0, 56
+; GCN-NEXT:    v_mov_b32_e32 v14, s16
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 44
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 12, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
+; GCN-NEXT:    s_cmp_lg_u32 s0, 57
+; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 1, s16
+; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
-; GCN-NEXT:    s_cmp_lg_u32 s0, 43
-; GCN-NEXT:    v_or_b32_e32 v13, v15, v13
-; GCN-NEXT:    v_lshrrev_b16_e64 v15, 11, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_and_b32_e32 v14, 1, v14
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
+; GCN-NEXT:    v_or_b32_e32 v14, v14, v18
+; GCN-NEXT:    v_and_b32_e32 v14, 3, v14
+; GCN-NEXT:    v_or_b32_e32 v14, v14, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 12, v16
+; GCN-NEXT:    v_and_b32_sdwa v14, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v14, v16, v14
+; GCN-NEXT:    s_cmp_lg_u32 s0, 47
+; GCN-NEXT:    v_or_b32_sdwa v15, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_lshrrev_b16_e64 v14, 15, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 42
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 10, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
+; GCN-NEXT:    s_cmp_lg_u32 s0, 46
+; GCN-NEXT:    v_lshrrev_b16_e64 v16, 14, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
 ; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
+; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
 ; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
-; GCN-NEXT:    s_cmp_lg_u32 s0, 41
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 9, s5
+; GCN-NEXT:    s_cmp_lg_u32 s0, 45
+; GCN-NEXT:    v_or_b32_e32 v14, v14, v16
+; GCN-NEXT:    v_lshrrev_b16_e64 v16, 13, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 40
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 8, s5
+; GCN-NEXT:    s_cmp_lg_u32 s0, 44
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 12, s5
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
@@ -1647,211 +1622,236 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
 ; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
 ; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
 ; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
-; GCN-NEXT:    v_and_b32_sdwa v15, v15, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    s_cmp_lg_u32 s0, 43
+; GCN-NEXT:    v_or_b32_e32 v14, v16, v14
+; GCN-NEXT:    v_lshrrev_b16_e64 v16, 11, s5
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 42
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 10, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
+; GCN-NEXT:    s_cmp_lg_u32 s0, 41
+; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 9, s5
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 40
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 8, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
+; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
+; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
+; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
+; GCN-NEXT:    v_and_b32_sdwa v16, v16, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 39
-; GCN-NEXT:    v_or_b32_e32 v15, v13, v15
-; GCN-NEXT:    v_lshrrev_b16_e64 v13, 7, s5
+; GCN-NEXT:    v_or_b32_e32 v16, v14, v16
+; GCN-NEXT:    v_lshrrev_b16_e64 v14, 7, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 38
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 6, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 6, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v13, 3, v13
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 37
-; GCN-NEXT:    v_or_b32_e32 v13, v13, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s5
+; GCN-NEXT:    v_or_b32_e32 v14, v14, v17
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 5, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 36
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 4, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 4, s5
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
+; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
+; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 35
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v13
-; GCN-NEXT:    v_lshrrev_b16_e64 v13, 3, s5
+; GCN-NEXT:    v_or_b32_e32 v17, v17, v14
+; GCN-NEXT:    v_lshrrev_b16_e64 v14, 3, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 34
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 2, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 2, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v13, 3, v13
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
+; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 33
-; GCN-NEXT:    v_or_b32_e32 v17, v13, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v13, 1, s5
+; GCN-NEXT:    v_or_b32_e32 v18, v14, v18
+; GCN-NEXT:    v_lshrrev_b16_e64 v14, 1, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 32
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
+; GCN-NEXT:    v_lshlrev_b16_e32 v14, 1, v14
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v13
+; GCN-NEXT:    v_or_b32_e32 v1, v1, v14
 ; GCN-NEXT:    v_and_b32_e32 v1, 3, v1
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
+; GCN-NEXT:    v_or_b32_e32 v1, v1, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
 ; GCN-NEXT:    v_and_b32_e32 v1, 15, v1
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v16
-; GCN-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v1, v1, v17
+; GCN-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 23
-; GCN-NEXT:    v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v14, s15
+; GCN-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GCN-NEXT:    v_mov_b32_e32 v15, s15
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 22
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
-; GCN-NEXT:    v_mov_b32_e32 v15, s14
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
-; GCN-NEXT:    s_cmp_lg_u32 s0, 21
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v15
-; GCN-NEXT:    v_mov_b32_e32 v15, s13
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 20
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_mov_b32_e32 v16, s12
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
-; GCN-NEXT:    s_cmp_lg_u32 s0, 19
-; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
-; GCN-NEXT:    v_mov_b32_e32 v15, s11
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 18
 ; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_mov_b32_e32 v16, s10
+; GCN-NEXT:    v_mov_b32_e32 v16, s14
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
 ; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
 ; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
 ; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
-; GCN-NEXT:    s_cmp_lg_u32 s0, 17
+; GCN-NEXT:    s_cmp_lg_u32 s0, 21
 ; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
-; GCN-NEXT:    v_mov_b32_e32 v16, s9
+; GCN-NEXT:    v_mov_b32_e32 v16, s13
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 16
+; GCN-NEXT:    s_cmp_lg_u32 s0, 20
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_mov_b32_e32 v18, s8
+; GCN-NEXT:    v_mov_b32_e32 v17, s12
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
 ; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
+; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
 ; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
+; GCN-NEXT:    s_cmp_lg_u32 s0, 19
 ; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v14
-; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
+; GCN-NEXT:    v_mov_b32_e32 v16, s11
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 18
+; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
+; GCN-NEXT:    v_mov_b32_e32 v17, s10
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
+; GCN-NEXT:    s_cmp_lg_u32 s0, 17
+; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
+; GCN-NEXT:    v_mov_b32_e32 v17, s9
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 16
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    v_mov_b32_e32 v19, s8
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
+; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_or_b32_e32 v17, v19, v17
+; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
+; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
+; GCN-NEXT:    v_and_b32_e32 v16, 15, v16
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 31
-; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
-; GCN-NEXT:    v_lshrrev_b16_e64 v15, 7, s1
+; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
+; GCN-NEXT:    v_lshrrev_b16_e64 v16, 7, s1
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 30
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 6, s1
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 6, s1
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 29
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s1
+; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 5, s1
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 28
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 4, s1
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s1
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
-; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
+; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_or_b32_e32 v17, v19, v17
+; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 27
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 3, s1
+; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 3, s1
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 26
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 2, s1
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s1
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 24
-; GCN-NEXT:    v_mov_b32_e32 v17, s1
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
+; GCN-NEXT:    v_mov_b32_e32 v18, s1
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 25
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 1, s1
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_or_b32_e32 v17, v17, v19
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s1
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
-; GCN-NEXT:    v_and_b32_sdwa v16, v16, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
+; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
+; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 12, v16
+; GCN-NEXT:    v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 15
-; GCN-NEXT:    v_or_b32_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_lshrrev_b16_e64 v15, 15, s4
+; GCN-NEXT:    v_or_b32_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_lshrrev_b16_e64 v16, 15, s4
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 14
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 14, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 14, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 13
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 13, s4
+; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 13, s4
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 12
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 12, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 12, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
+; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 11
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 11, s4
-; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 11, s4
+; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 10
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 10, s4
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v17, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v14, 10, s4
+; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v19, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 9
-; GCN-NEXT:    v_lshrrev_b16_e64 v13, 9, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v18, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v12, 9, s4
+; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 8
 ; GCN-NEXT:    v_lshrrev_b16_e64 v11, 8, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v12, 1, v12, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 7
 ; GCN-NEXT:    v_lshrrev_b16_e64 v10, 7, s4
@@ -1886,8 +1886,8 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
 ; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
+; GCN-NEXT:    v_and_b32_e32 v14, 1, v14
+; GCN-NEXT:    v_lshlrev_b16_e32 v12, 1, v12
 ; GCN-NEXT:    v_and_b32_e32 v11, 1, v11
 ; GCN-NEXT:    v_and_b32_e32 v9, 1, v9
 ; GCN-NEXT:    v_lshlrev_b16_e32 v8, 1, v8
@@ -1895,33 +1895,33 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
 ; GCN-NEXT:    v_and_b32_e32 v5, 1, v5
 ; GCN-NEXT:    v_lshlrev_b16_e32 v4, 1, v4
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
-; GCN-NEXT:    v_or_b32_e32 v11, v11, v13
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
+; GCN-NEXT:    v_or_b32_e32 v11, v11, v12
 ; GCN-NEXT:    v_lshlrev_b16_e32 v10, 3, v10
 ; GCN-NEXT:    v_lshlrev_b16_e32 v9, 2, v9
 ; GCN-NEXT:    v_or_b32_e32 v7, v7, v8
 ; GCN-NEXT:    v_lshlrev_b16_e32 v6, 3, v6
 ; GCN-NEXT:    v_lshlrev_b16_e32 v5, 2, v5
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v4
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
+; GCN-NEXT:    v_or_b32_e32 v14, v17, v14
 ; GCN-NEXT:    v_and_b32_e32 v11, 3, v11
 ; GCN-NEXT:    v_or_b32_e32 v9, v10, v9
 ; GCN-NEXT:    v_and_b32_e32 v7, 3, v7
 ; GCN-NEXT:    v_or_b32_e32 v5, v6, v5
 ; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
-; GCN-NEXT:    v_or_b32_e32 v11, v11, v16
+; GCN-NEXT:    v_or_b32_e32 v11, v11, v14
 ; GCN-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v5
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
-; GCN-NEXT:    v_and_b32_sdwa v11, v11, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 12, v16
+; GCN-NEXT:    v_and_b32_sdwa v11, v11, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    v_lshlrev_b16_e32 v7, 4, v7
 ; GCN-NEXT:    v_and_b32_e32 v0, 15, v0
-; GCN-NEXT:    v_or_b32_e32 v11, v15, v11
+; GCN-NEXT:    v_or_b32_e32 v11, v16, v11
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v7
 ; GCN-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-NEXT:    v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GCN-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 95d0b18f9b15b..6bcebdad0eb79 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -2801,14 +2801,16 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v12, v9, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v10, v2, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v12, v9, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 14
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v13, v9, vcc
+; GFX9-NEXT:    v_perm_b32 v1, v10, v1, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v13, v9, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 15
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -2816,32 +2818,30 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 12
-; GFX9-NEXT:    v_perm_b32 v0, v12, v0, s2
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v14, v9, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v10, v0, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v14, v9, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 10
-; GFX9-NEXT:    v_perm_b32 v7, v12, v7, s2
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v15, v9, vcc
+; GFX9-NEXT:    v_perm_b32 v7, v10, v7, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v15, v9, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 11
-; GFX9-NEXT:    v_perm_b32 v2, v10, v2, s2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 8
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
+; GFX9-NEXT:    v_perm_b32 v6, v10, v6, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v16, v9, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 9
-; GFX9-NEXT:    v_perm_b32 v1, v11, v1, s2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX9-NEXT:    v_perm_b32 v6, v12, v6, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc
 ; GFX9-NEXT:    v_perm_b32 v5, v10, v5, s2
 ; GFX9-NEXT:    v_perm_b32 v4, v9, v4, s2
 ; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
@@ -2851,7 +2851,7 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; VI-LABEL: v_insertelement_v16f16_dynamic:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
@@ -2865,81 +2865,81 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
 ; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
-; VI-NEXT:    s_cmp_eq_u32 s7, 14
+; VI-NEXT:    s_cmp_eq_u32 s5, 14
 ; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
-; VI-NEXT:    v_mov_b32_e32 v12, s6
+; VI-NEXT:    v_mov_b32_e32 v12, s4
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 15
+; VI-NEXT:    s_cmp_eq_u32 s5, 15
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_cndmask_b32_e32 v13, v3, v12, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s5, 12
+; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 12
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 13
+; VI-NEXT:    s_cmp_eq_u32 s5, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
-; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v12, s[0:1]
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 10
-; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 11
+; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s5, 10
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; VI-NEXT:    v_cndmask_b32_e32 v14, v14, v12, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s5, 11
 ; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
-; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s[2:3]
-; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 8
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v15, v15, v12, s[2:3]
-; VI-NEXT:    s_cmp_eq_u32 s7, 9
+; VI-NEXT:    v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s5, 8
+; VI-NEXT:    v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_cndmask_b32_e32 v13, v15, v12, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s5, 9
 ; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
-; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc
-; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 6
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_cndmask_b32_e32 v15, v16, v12, vcc
+; VI-NEXT:    s_cmp_eq_u32 s5, 6
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_cndmask_b32_e32 v13, v16, v12, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 7
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[4:5]
+; VI-NEXT:    s_cmp_eq_u32 s5, 7
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; VI-NEXT:    v_cndmask_b32_e64 v14, v14, v12, s[0:1]
-; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 4
-; VI-NEXT:    v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_cndmask_b32_e32 v15, v17, v12, vcc
+; VI-NEXT:    s_cmp_eq_u32 s5, 4
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_cndmask_b32_e32 v13, v17, v12, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 5
-; VI-NEXT:    v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
+; VI-NEXT:    s_cmp_eq_u32 s5, 5
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
+; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v12, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 2
-; VI-NEXT:    v_cndmask_b32_e32 v13, v13, v12, vcc
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 3
-; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
-; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
+; VI-NEXT:    s_cmp_eq_u32 s5, 2
+; VI-NEXT:    v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_cndmask_b32_e32 v13, v18, v12, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 0
+; VI-NEXT:    s_cmp_eq_u32 s5, 3
+; VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; VI-NEXT:    v_cndmask_b32_e32 v14, v14, v12, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 1
+; VI-NEXT:    s_cmp_eq_u32 s5, 0
 ; VI-NEXT:    v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v4
+; VI-NEXT:    v_cndmask_b32_e32 v13, v19, v12, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s5, 1
+; VI-NEXT:    v_lshrrev_b32_e32 v20, 16, v4
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
-; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; VI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; VI-NEXT:    v_cndmask_b32_e32 v12, v20, v12, vcc
+; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; VI-NEXT:    v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; VI-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
@@ -2972,101 +2972,101 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
 ; CI-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
 ; CI-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[0:1]
 ; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CI-NEXT:    s_cmp_eq_u32 s5, 11
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
 ; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
-; CI-NEXT:    v_cndmask_b32_e64 v12, v12, v10, s[2:3]
 ; CI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; CI-NEXT:    s_cmp_eq_u32 s5, 10
-; CI-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
+; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
 ; CI-NEXT:    v_cndmask_b32_e32 v13, v13, v10, vcc
-; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; CI-NEXT:    s_cselect_b64 vcc, -1, 0
-; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
-; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; CI-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
-; CI-NEXT:    v_or_b32_e32 v2, v2, v12
-; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; CI-NEXT:    v_or_b32_e32 v1, v1, v12
-; CI-NEXT:    v_cvt_f32_f16_e32 v12, v14
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; CI-NEXT:    v_cvt_f32_f16_e32 v13, v15
 ; CI-NEXT:    s_cmp_eq_u32 s5, 9
 ; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; CI-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; CI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; CI-NEXT:    s_cmp_eq_u32 s5, 8
-; CI-NEXT:    v_cvt_f32_f16_e32 v14, v16
-; CI-NEXT:    v_cndmask_b32_e32 v12, v12, v10, vcc
+; CI-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT:    v_cndmask_b32_e32 v14, v14, v10, vcc
 ; CI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; CI-NEXT:    s_cmp_eq_u32 s5, 7
+; CI-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
 ; CI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; CI-NEXT:    s_cmp_eq_u32 s5, 6
-; CI-NEXT:    v_cndmask_b32_e32 v13, v13, v10, vcc
+; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT:    v_cndmask_b32_e32 v15, v15, v10, vcc
 ; CI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; CI-NEXT:    s_cmp_eq_u32 s5, 5
-; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
 ; CI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; CI-NEXT:    s_cmp_eq_u32 s5, 4
-; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT:    v_cndmask_b32_e32 v14, v14, v10, vcc
+; CI-NEXT:    v_or_b32_e32 v3, v3, v11
+; CI-NEXT:    v_cndmask_b32_e32 v11, v16, v10, vcc
 ; CI-NEXT:    s_cselect_b64 vcc, -1, 0
-; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; CI-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; CI-NEXT:    v_cndmask_b32_e64 v12, v12, v10, s[2:3]
 ; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
+; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT:    v_lshrrev_b32_e32 v17, 16, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT:    v_or_b32_e32 v6, v6, v11
+; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
 ; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; CI-NEXT:    v_or_b32_e32 v3, v3, v11
-; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
-; CI-NEXT:    v_or_b32_e32 v0, v0, v12
-; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; CI-NEXT:    s_cmp_eq_u32 s5, 3
 ; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT:    v_or_b32_e32 v7, v7, v12
+; CI-NEXT:    v_or_b32_e32 v2, v2, v12
 ; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT:    v_or_b32_e32 v6, v6, v12
-; CI-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; CI-NEXT:    s_cmp_eq_u32 s5, 3
-; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
 ; CI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; CI-NEXT:    s_cmp_eq_u32 s5, 2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
+; CI-NEXT:    v_or_b32_e32 v0, v0, v12
+; CI-NEXT:    v_cndmask_b32_e32 v12, v17, v10, vcc
 ; CI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; CI-NEXT:    s_cmp_eq_u32 s5, 1
 ; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
 ; CI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; CI-NEXT:    s_cmp_eq_u32 s5, 0
-; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT:    v_cndmask_b32_e32 v12, v12, v10, vcc
+; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
 ; CI-NEXT:    s_cselect_b64 vcc, -1, 0
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; CI-NEXT:    v_or_b32_e32 v5, v5, v10
+; CI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
+; CI-NEXT:    v_or_b32_e32 v1, v1, v13
+; CI-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
+; CI-NEXT:    v_or_b32_e32 v5, v5, v10
+; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; CI-NEXT:    v_or_b32_e32 v7, v7, v13
 ; CI-NEXT:    v_or_b32_e32 v4, v4, v10
 ; CI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; CI-NEXT:    s_nop 0

diff  --git a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir
index 4aa7738ef6b42..1efaf5e621ab3 100644
--- a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir
+++ b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir
@@ -36,6 +36,14 @@ body:             |
   ; GCN-NEXT:   [[V_CVT_F64_I32_e32_2:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY2]], implicit $mode, implicit $exec
   ; GCN-NEXT:   [[V_CVT_F64_I32_e32_3:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY3]], implicit $mode, implicit $exec
   ; GCN-NEXT:   [[V_CVT_F64_I32_e32_4:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY4]], implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_5:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY5]], implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_6:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY6]], implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_7:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY7]], implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_8:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY8]], implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_9:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY9]], implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_10:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY10]], implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_11:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY11]], implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_12:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY12]], implicit $mode, implicit $exec
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1:
   ; GCN-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
@@ -47,22 +55,14 @@ body:             |
   ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_2]], implicit $exec
   ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_3]], implicit $exec
   ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_4]], implicit $exec
-  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_5:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY5]], implicit $mode, implicit $exec
-  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_5]], implicit $exec
-  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_6:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY6]], implicit $mode, implicit $exec
-  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_6]], implicit $exec
-  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_7:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY7]], implicit $mode, implicit $exec
-  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_7]], implicit $exec
-  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_8:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY8]], implicit $mode, implicit $exec
-  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_8]], implicit $exec
-  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_9:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY9]], implicit $mode, implicit $exec
-  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_9]], implicit $exec
-  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_10:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY10]], implicit $mode, implicit $exec
-  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_10]], implicit $exec
-  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_11:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY11]], implicit $mode, implicit $exec
-  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_11]], implicit $exec
-  ; GCN-NEXT:   [[V_CVT_F64_I32_e32_12:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY12]], implicit $mode, implicit $exec
-  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_12]], implicit $exec
+  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_5]], implicit $exec
+  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_6]], implicit $exec
+  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_7]], implicit $exec
+  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_8]], implicit $exec
+  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_9]], implicit $exec
+  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_10]], implicit $exec
+  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_11]], implicit $exec
+  ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_12]], implicit $exec
   ; GCN-NEXT:   [[V_CVT_F64_I32_e32_13:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY13]], implicit $mode, implicit $exec
   ; GCN-NEXT:   $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_13]], implicit $exec
   ; GCN-NEXT:   [[V_CVT_F64_I32_e32_14:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY14]], implicit $mode, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 4ed575df2fbb3..1aed1de7cf08b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -493,51 +493,51 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; SI-NEXT:    s_bfe_u32 s7, s19, 0xb0014
 ; SI-NEXT:    s_addk_i32 s7, 0xfc01
 ; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], s7
+; SI-NEXT:    v_mov_b32_e32 v13, s5
 ; SI-NEXT:    s_andn2_b64 s[8:9], s[18:19], s[8:9]
 ; SI-NEXT:    s_and_b32 s10, s19, 0x80000000
+; SI-NEXT:    v_mov_b32_e32 v12, s4
 ; SI-NEXT:    s_cmp_lt_i32 s7, 0
+; SI-NEXT:    v_add_f64 v[12:13], s[12:13], -v[12:13]
 ; SI-NEXT:    s_cselect_b32 s8, 0, s8
 ; SI-NEXT:    s_cselect_b32 s9, s10, s9
 ; SI-NEXT:    s_cmp_gt_i32 s7, 51
 ; SI-NEXT:    s_cselect_b32 s9, s19, s9
-; SI-NEXT:    s_cselect_b32 s8, s18, s8
-; SI-NEXT:    s_bfe_u32 s7, s17, 0xb0014
-; SI-NEXT:    v_mov_b32_e32 v13, s5
-; SI-NEXT:    s_addk_i32 s7, 0xfc01
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_lshr_b64 s[10:11], s[20:21], s7
-; SI-NEXT:    v_add_f64 v[12:13], s[12:13], -v[12:13]
-; SI-NEXT:    s_andn2_b64 s[10:11], s[16:17], s[10:11]
-; SI-NEXT:    s_and_b32 s12, s17, 0x80000000
-; SI-NEXT:    s_cmp_lt_i32 s7, 0
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[12:13]|, 0.5
+; SI-NEXT:    s_cselect_b32 s8, s18, s8
 ; SI-NEXT:    v_mov_b32_e32 v13, s9
-; SI-NEXT:    s_cselect_b32 s10, 0, s10
-; SI-NEXT:    s_cselect_b32 s11, s12, s11
-; SI-NEXT:    s_cmp_gt_i32 s7, 51
 ; SI-NEXT:    v_mov_b32_e32 v12, s8
-; SI-NEXT:    s_cselect_b32 s11, s17, s11
 ; SI-NEXT:    v_mov_b32_e32 v9, s13
 ; SI-NEXT:    v_add_f64 v[12:13], s[18:19], -v[12:13]
-; SI-NEXT:    s_cselect_b32 s10, s16, s10
-; SI-NEXT:    v_mov_b32_e32 v15, s11
 ; SI-NEXT:    v_bfi_b32 v9, s6, v8, v9
-; SI-NEXT:    v_mov_b32_e32 v14, s10
 ; SI-NEXT:    v_cndmask_b32_e32 v17, 0, v9, vcc
 ; SI-NEXT:    v_mov_b32_e32 v9, s19
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[12:13]|, 0.5
-; SI-NEXT:    v_add_f64 v[14:15], s[16:17], -v[14:15]
 ; SI-NEXT:    v_bfi_b32 v9, s6, v8, v9
+; SI-NEXT:    s_bfe_u32 s7, s17, 0xb0014
 ; SI-NEXT:    v_cndmask_b32_e32 v13, 0, v9, vcc
+; SI-NEXT:    v_mov_b32_e32 v12, 0
+; SI-NEXT:    s_addk_i32 s7, 0xfc01
+; SI-NEXT:    v_add_f64 v[14:15], s[8:9], v[12:13]
+; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], s7
+; SI-NEXT:    s_andn2_b64 s[8:9], s[16:17], s[8:9]
+; SI-NEXT:    s_and_b32 s10, s17, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s7, 0
+; SI-NEXT:    s_cselect_b32 s8, 0, s8
+; SI-NEXT:    s_cselect_b32 s9, s10, s9
+; SI-NEXT:    s_cmp_gt_i32 s7, 51
+; SI-NEXT:    s_cselect_b32 s9, s17, s9
+; SI-NEXT:    s_cselect_b32 s8, s16, s8
+; SI-NEXT:    v_mov_b32_e32 v13, s9
+; SI-NEXT:    v_mov_b32_e32 v12, s8
+; SI-NEXT:    v_add_f64 v[12:13], s[16:17], -v[12:13]
 ; SI-NEXT:    v_mov_b32_e32 v9, s17
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[12:13]|, 0.5
 ; SI-NEXT:    v_bfi_b32 v8, s6, v8, v9
-; SI-NEXT:    v_mov_b32_e32 v12, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v8, vcc
 ; SI-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-NEXT:    v_mov_b32_e32 v16, 0
-; SI-NEXT:    v_add_f64 v[14:15], s[8:9], v[12:13]
-; SI-NEXT:    v_add_f64 v[12:13], s[10:11], v[8:9]
+; SI-NEXT:    v_add_f64 v[12:13], s[8:9], v[8:9]
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_add_f64 v[8:9], s[4:5], v[16:17]
 ; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
@@ -549,15 +549,15 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; CI-LABEL: round_v8f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
-; CI-NEXT:    s_brev_b32 s2, -2
-; CI-NEXT:    v_mov_b32_e32 v16, 0x3ff00000
-; CI-NEXT:    s_load_dwordx2 s[20:21], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s23, 0xf000
+; CI-NEXT:    s_brev_b32 s20, -2
+; CI-NEXT:    v_mov_b32_e32 v20, 0x3ff00000
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[6:7]
 ; CI-NEXT:    v_mov_b32_e32 v4, s7
 ; CI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
-; CI-NEXT:    v_bfi_b32 v4, s2, v16, v4
+; CI-NEXT:    v_bfi_b32 v4, s20, v20, v4
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
 ; CI-NEXT:    v_mov_b32_e32 v2, 0
 ; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
@@ -566,7 +566,7 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; CI-NEXT:    v_add_f64 v[0:1], s[4:5], -v[4:5]
 ; CI-NEXT:    v_mov_b32_e32 v6, s5
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
-; CI-NEXT:    v_bfi_b32 v6, s2, v16, v6
+; CI-NEXT:    v_bfi_b32 v6, s20, v20, v6
 ; CI-NEXT:    v_cndmask_b32_e32 v1, 0, v6, vcc
 ; CI-NEXT:    v_trunc_f64_e32 v[6:7], s[10:11]
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
@@ -574,7 +574,7 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; CI-NEXT:    v_add_f64 v[4:5], s[10:11], -v[6:7]
 ; CI-NEXT:    v_mov_b32_e32 v8, s11
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; CI-NEXT:    v_bfi_b32 v8, s2, v16, v8
+; CI-NEXT:    v_bfi_b32 v8, s20, v20, v8
 ; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v8, vcc
 ; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[8:9]
 ; CI-NEXT:    v_mov_b32_e32 v4, 0
@@ -582,47 +582,47 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; CI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[8:9]
 ; CI-NEXT:    v_mov_b32_e32 v10, s9
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; CI-NEXT:    v_bfi_b32 v10, s2, v16, v10
+; CI-NEXT:    v_bfi_b32 v10, s20, v20, v10
 ; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v10, vcc
+; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[14:15]
 ; CI-NEXT:    v_mov_b32_e32 v4, 0
 ; CI-NEXT:    v_add_f64 v[4:5], v[8:9], v[4:5]
-; CI-NEXT:    v_mov_b32_e32 v8, s15
-; CI-NEXT:    v_bfi_b32 v18, s2, v16, v8
-; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[16:17]
-; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[18:19]
-; CI-NEXT:    v_add_f64 v[14:15], s[16:17], -v[8:9]
-; CI-NEXT:    v_mov_b32_e32 v19, s19
+; CI-NEXT:    v_add_f64 v[8:9], s[14:15], -v[10:11]
+; CI-NEXT:    v_mov_b32_e32 v12, s15
+; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5
+; CI-NEXT:    v_bfi_b32 v12, s20, v20, v12
+; CI-NEXT:    v_trunc_f64_e32 v[16:17], s[12:13]
+; CI-NEXT:    v_cndmask_b32_e32 v9, 0, v12, vcc
+; CI-NEXT:    v_mov_b32_e32 v8, 0
+; CI-NEXT:    v_add_f64 v[10:11], v[10:11], v[8:9]
+; CI-NEXT:    v_add_f64 v[8:9], s[12:13], -v[16:17]
+; CI-NEXT:    v_mov_b32_e32 v12, s13
+; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5
+; CI-NEXT:    v_bfi_b32 v12, s20, v20, v12
+; CI-NEXT:    v_cndmask_b32_e32 v9, 0, v12, vcc
+; CI-NEXT:    v_trunc_f64_e32 v[12:13], s[18:19]
+; CI-NEXT:    v_mov_b32_e32 v18, s19
+; CI-NEXT:    v_add_f64 v[14:15], s[18:19], -v[12:13]
+; CI-NEXT:    v_bfi_b32 v18, s20, v20, v18
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
-; CI-NEXT:    v_add_f64 v[14:15], s[18:19], -v[10:11]
-; CI-NEXT:    v_mov_b32_e32 v17, s17
-; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
-; CI-NEXT:    v_bfi_b32 v19, s2, v16, v19
-; CI-NEXT:    v_trunc_f64_e32 v[12:13], s[12:13]
-; CI-NEXT:    v_bfi_b32 v17, s2, v16, v17
-; CI-NEXT:    v_cndmask_b32_e64 v15, 0, v19, s[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v14, 0
-; CI-NEXT:    v_add_f64 v[10:11], v[10:11], v[14:15]
-; CI-NEXT:    v_cndmask_b32_e32 v15, 0, v17, vcc
-; CI-NEXT:    v_mov_b32_e32 v14, 0
-; CI-NEXT:    v_mov_b32_e32 v17, s13
-; CI-NEXT:    v_add_f64 v[8:9], v[8:9], v[14:15]
-; CI-NEXT:    v_add_f64 v[14:15], s[12:13], -v[12:13]
-; CI-NEXT:    v_bfi_b32 v19, s2, v16, v17
-; CI-NEXT:    v_trunc_f64_e32 v[16:17], s[14:15]
-; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
-; CI-NEXT:    v_add_f64 v[14:15], s[14:15], -v[16:17]
-; CI-NEXT:    s_mov_b32 s22, -1
-; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
-; CI-NEXT:    v_mov_b32_e32 v14, 0
-; CI-NEXT:    v_cndmask_b32_e64 v15, 0, v18, s[0:1]
-; CI-NEXT:    v_add_f64 v[14:15], v[16:17], v[14:15]
-; CI-NEXT:    v_cndmask_b32_e32 v17, 0, v19, vcc
-; CI-NEXT:    v_mov_b32_e32 v16, 0
-; CI-NEXT:    v_add_f64 v[12:13], v[12:13], v[16:17]
-; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:48
-; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:32
-; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
-; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0
+; CI-NEXT:    v_cndmask_b32_e32 v15, 0, v18, vcc
+; CI-NEXT:    v_trunc_f64_e32 v[18:19], s[16:17]
+; CI-NEXT:    v_add_f64 v[14:15], v[12:13], v[14:15]
+; CI-NEXT:    v_add_f64 v[12:13], s[16:17], -v[18:19]
+; CI-NEXT:    v_mov_b32_e32 v21, s17
+; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[12:13]|, 0.5
+; CI-NEXT:    v_bfi_b32 v20, s20, v20, v21
+; CI-NEXT:    v_cndmask_b32_e32 v13, 0, v20, vcc
+; CI-NEXT:    v_mov_b32_e32 v12, 0
+; CI-NEXT:    v_mov_b32_e32 v8, 0
+; CI-NEXT:    v_add_f64 v[12:13], v[18:19], v[12:13]
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_add_f64 v[8:9], v[16:17], v[8:9]
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
   %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
   store <8 x double> %result, ptr addrspace(1) %out

diff  --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 8946846898f85..971ae8ea46d75 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -2074,27 +2074,29 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s19, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s1, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s0, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s3, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s35, s1, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s36, s0, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s37, s3, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s38, s2, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s1, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s3, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s35, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s1, s1, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s3, s3, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
@@ -2104,60 +2106,56 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, 0xffff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s36
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s35
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s30
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s31
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s28
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s30
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s27
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s26
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s28
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s27
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s24
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s26
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s25
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s24
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s22
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s20
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s35
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s18
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32:
@@ -2196,32 +2194,32 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-HSA-NEXT:    s_and_b32 s10, s10, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s13, s13, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s12, s12, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s15, s15, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s14, s14, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s0, s15, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s1, s14, 0xffff
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s34
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s33
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s31
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s30
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s31
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s30
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s28
@@ -2460,90 +2458,88 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s19, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s18, s1, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s19, s0, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s20, s1
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s21, s0
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s20, s1, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s21, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s1, s1
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s0, s0
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s22, s3, 16
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s23, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s24, s3
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s25, s2
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s26, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s27, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s3, s3
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s2, s2
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s24, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s25, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s4, s4
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s28, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s29, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s26, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s27, s6, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s6, s6
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s30, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s31, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s28, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s29, s8, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s9, s9
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s8, s8
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s34, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s30, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s31, s10, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s11, s11
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s10, s10
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s35, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s36, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s34, s12, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s13, s13
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s12, s12
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s37, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s38, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s35, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s36, s14, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s15, s15
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s14, s14
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s12, s12
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s36
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s37
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s35
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s35
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s31
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s30
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s30
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s28
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s27
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s28
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s26
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s27
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s25
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s26
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s24
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s22
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s18
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s20
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32:
@@ -2554,6 +2550,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_ashr_i32 s18, s1, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s19, s0, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s20, s1
+; GCN-HSA-NEXT:    s_sext_i32_i16 s21, s0
 ; GCN-HSA-NEXT:    s_ashr_i32 s22, s3, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s23, s2, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s24, s5, 16
@@ -2566,36 +2564,34 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-HSA-NEXT:    s_ashr_i32 s31, s10, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s33, s13, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s34, s12, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s35, s15, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s36, s14, 16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s21, s0
+; GCN-HSA-NEXT:    s_ashr_i32 s0, s15, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s1, s14, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
-; GCN-HSA-NEXT:    s_sext_i32_i16 s20, s1
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
-; GCN-HSA-NEXT:    s_sext_i32_i16 s12, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s15, s15
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s14, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
-; GCN-HSA-NEXT:    s_sext_i32_i16 s13, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s36
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s34
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s33
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT:    s_sext_i32_i16 s11, s11
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    s_sext_i32_i16 s13, s13
+; GCN-HSA-NEXT:    s_sext_i32_i16 s12, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s34
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    s_sext_i32_i16 s11, s11
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s10, s10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 64
@@ -2910,18 +2906,18 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s16, s16, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s19, s19, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s18, s18, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s21, s21, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s20, s20, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s23, s23, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s22, s22, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s25, s25, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s24, s24, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s27, s27, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s26, s26, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s29, s29, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s28, s28, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s31, s31, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s30, s30, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s21, s21, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s26, s26, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s22, s22, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s36
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s37
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
@@ -2941,21 +2937,22 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s24
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s64
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s25
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s63
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s62
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s23
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s63
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(3)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s62
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s61
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s61
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s60
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s59
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
@@ -3093,96 +3090,96 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-HSA-NEXT:    s_and_b32 s14, s14, 0xffff
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xf0
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xe0
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xd0
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xc0
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xb0
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xa0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s62
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s11
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s61
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[23:24], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x90
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s60
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s59
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s63
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[25:26], v[12:15]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x80
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s66
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s65
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s62
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s61
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s60
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s59
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s58
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s7
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[19:20], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s57
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[27:28], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s63
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[4:7]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s56
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s55
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s54
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s68
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s53
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s19
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s67
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s52
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s18
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s42
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s56
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s55
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s54
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s53
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[20:23]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s52
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s42
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s41
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s51
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s41
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s40
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s49
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s39
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[20:23]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 64
@@ -3237,121 +3234,113 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ;
 ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[36:39], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s39, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s38, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x40
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s36, s16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s37, s17
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[16:31], s[18:19], 0x40
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s69, s31, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s70, s30, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s51, s15, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s52, s14, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s15, s15, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s67, s31, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s68, s30, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s31, s31, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s30, s30, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s33, s1, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s34, s0, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s35, s3, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s40, s2, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s60, s1, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s61, s0, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s62, s3, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s63, s2, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s67, s29, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s68, s28, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s65, s29, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s66, s28, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s29, s29, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s28, s28, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s36
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s30
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s70
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s68
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s31
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s69
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s65, s27, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s66, s26, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s67
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s63, s27, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s64, s26, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s27, s27, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s26, s26, 0xffff
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s59, s25, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s61, s25, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s28
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s68
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s66
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s29
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s67
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s64, s24, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s65
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s62, s24, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s25, s25, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s24, s24, 0xffff
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s57, s23, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:224
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s59, s23, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s26
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s66
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s64
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s27
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s65
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s58, s22, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s63
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s60, s22, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s23, s23, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s22, s22, 0xffff
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s55, s21, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s57, s21, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s24
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s64
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s62
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s25
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s59
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s56, s20, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s61
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s58, s20, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s21, s21, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s20, s20, 0xffff
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s53, s19, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:192
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s55, s19, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s58
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s60
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s23
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s57
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s54, s18, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s59
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s56, s18, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s19, s19, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s18, s18, 0xffff
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s51, s17, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:176
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s53, s17, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s58
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s21
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s55
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s52, s16, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s57
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s54, s16, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s17, s17, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s16, s16, 0xffff
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s49, s15, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:160
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s54
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s56
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s19
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s53
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s50, s14, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s15, s15, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s14, s14, 0xffff
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s38, s13, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s55
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:144
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s49, s13, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s52
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s54
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s17
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s51
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s39, s12, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s53
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s50, s12, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s13, s13, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s12, s12, 0xffff
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:128
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s47, s11, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s50
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s52
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s49
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s51
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s48, s10, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s11, s11, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s10, s10, 0xffff
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s45, s9, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s39
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s50
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s13
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s38
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s49
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s46, s8, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s9, s9, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s8, s8, 0xffff
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:96
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s43, s7, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s48
@@ -3360,7 +3349,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s44, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s41, s5, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s46
@@ -3369,31 +3358,37 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s42, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s35, s3, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s44
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s43
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s40, s2, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s3, s3, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s33, s1, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s42
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s41
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s34, s0, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s63
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s40
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s62
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s35
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s61
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s34
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s60
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s33
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_zextload_v64i16_to_v64i32:
@@ -3636,17 +3631,16 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s60, s20, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s21, s21
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s20, s20
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s61, s22, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s62, s23
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s61, s23, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s62, s22, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s23, s23
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s22, s22
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s63, s25, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s64, s24, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s25, s25
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s63, s24, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s64, s25
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s24, s24
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s65, s27, 16
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s66, s26, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s27, s27
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s26, s26
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s67, s29, 16
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s68, s28, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s29, s29
@@ -3655,7 +3649,8 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s70, s30, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s31, s31
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s30, s30
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s23, s23, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s26, s26
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s25, s25, 16
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s36
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s37
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
@@ -3673,23 +3668,24 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s27
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s65
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s64
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s25
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s63
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s61
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s62
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s63
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s64
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s25
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(3)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s62
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s61
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s23
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s60
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s59
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
@@ -3813,110 +3809,110 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xf0
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s53, s1
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xe0
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xd0
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xc0
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xb0
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s1
-; GCN-HSA-NEXT:    s_sext_i32_i16 s11, s11
-; GCN-HSA-NEXT:    s_sext_i32_i16 s10, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xa0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s11
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s63
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[23:24], v[8:11]
-; GCN-HSA-NEXT:    s_sext_i32_i16 s9, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s1
-; GCN-HSA-NEXT:    s_sext_i32_i16 s8, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s1
+; GCN-HSA-NEXT:    s_sext_i32_i16 s13, s13
+; GCN-HSA-NEXT:    s_sext_i32_i16 s12, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x90
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s62
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s61
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s66
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s65
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[25:26], v[12:15]
-; GCN-HSA-NEXT:    s_sext_i32_i16 s15, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x80
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    s_sext_i32_i16 s14, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s1
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s6, s6
+; GCN-HSA-NEXT:    s_sext_i32_i16 s9, s9
+; GCN-HSA-NEXT:    s_sext_i32_i16 s8, s8
+; GCN-HSA-NEXT:    s_sext_i32_i16 s11, s11
+; GCN-HSA-NEXT:    s_sext_i32_i16 s10, s10
+; GCN-HSA-NEXT:    s_sext_i32_i16 s15, s15
+; GCN-HSA-NEXT:    s_sext_i32_i16 s14, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
+; GCN-HSA-NEXT:    s_sext_i32_i16 s3, s3
+; GCN-HSA-NEXT:    s_sext_i32_i16 s2, s2
+; GCN-HSA-NEXT:    s_sext_i32_i16 s5, s5
+; GCN-HSA-NEXT:    s_sext_i32_i16 s4, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s68
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s67
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s63
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s62
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s61
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s60
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s7
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[19:20], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s59
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[27:28], v[16:19]
-; GCN-HSA-NEXT:    s_sext_i32_i16 s13, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    s_sext_i32_i16 s12, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s1
-; GCN-HSA-NEXT:    s_sext_i32_i16 s3, s3
-; GCN-HSA-NEXT:    s_sext_i32_i16 s2, s2
-; GCN-HSA-NEXT:    s_sext_i32_i16 s5, s5
-; GCN-HSA-NEXT:    s_sext_i32_i16 s4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s66
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s65
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[4:7]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s58
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s57
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s56
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s54
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s55
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s19
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s53
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s52
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s18
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s50
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s58
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s57
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s56
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s55
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[20:23]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s52
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s49
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s51
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s49
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s46
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s47
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s45
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[20:23]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 64
@@ -3971,161 +3967,161 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
 ;
 ; GCN-NOHSA-VI-LABEL: constant_sextload_v64i16_to_v64i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[36:39], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s39, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s38, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x0
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x40
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[16:31], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s36, s0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s37, s1
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x40
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s49, s31, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s69, s15, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s70, s14, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s51, s31, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s52, s30, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s31, s31
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s67, s15, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s68, s14, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s15, s15
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s14, s14
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s51, s1, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s52, s0, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s53, s1
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s54, s0
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s55, s3, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s56, s2, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s57, s3
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s58, s2
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s67, s13, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s68, s12, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s65, s13, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s66, s12, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s13, s13
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s12, s12
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s36
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s70
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s68
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s69
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s65, s11, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s66, s10, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s67
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s63, s11, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s64, s10, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s11, s11
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s10, s10
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s63, s9, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s61, s9, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s68
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s66
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s13
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s67
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s64, s8, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s65
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s62, s8, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s9, s9
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s8, s8
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s61, s7, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:224
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s59, s7, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s66
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s64
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s65
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s62, s6, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s63
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s60, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s6, s6
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s59, s5, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s57, s5, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s64
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s62
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s63
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s60, s4, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s61
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s58, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s4, s4
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s50, s30, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:192
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s55, s3, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s62
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s60
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s61
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s31, s31
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s59
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s56, s2, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s3, s3
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s2, s2
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:176
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s53, s1, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s60
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s58
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s59
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s57
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s54, s0, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s1, s1
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s0, s0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:160
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s30, s30
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s58
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s56
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s57
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s55
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s47, s29, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s54
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s52
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s53
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s51
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s48, s28, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:144
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s49, s29, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s54
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s53
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s50, s28, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s29, s29
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s28, s28
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s45, s27, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:128
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s47, s27, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s30
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s50
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s52
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s31
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s49
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s46, s26, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s51
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s48, s26, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s27, s27
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s26, s26
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s43, s25, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s45, s25, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s28
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s48
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s50
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s29
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s47
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s44, s24, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s49
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s46, s24, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s25, s25
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s24, s24
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s41, s23, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s43, s23, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s26
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s46
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s48
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s27
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s45
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s42, s22, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s47
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s44, s22, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s23, s23
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s22, s22
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s39, s21, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s41, s21, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s24
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s44
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s46
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s25
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s43
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s40, s20, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s45
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s42, s20, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s21, s21
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s20, s20
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:64
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s35, s19, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s42
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s44
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s23
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s41
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s38, s18, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s43
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s40, s18, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s19, s19
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s18, s18
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s33, s17, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s40
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s42
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s21
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s39
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s41
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s34, s16, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s17, s17
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s16, s16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s38
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s40
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s19
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s35
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s17
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s33
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_sextload_v64i16_to_v64i32:
@@ -5777,55 +5773,57 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_mov_b32 s6, s15
-; GCN-HSA-NEXT:    s_mov_b32 s16, s13
-; GCN-HSA-NEXT:    s_mov_b32 s18, s11
-; GCN-HSA-NEXT:    s_mov_b32 s20, s9
-; GCN-HSA-NEXT:    s_lshr_b32 s22, s14, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s24, s12, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s26, s10, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s28, s8, 16
-; GCN-HSA-NEXT:    s_bfe_i64 s[34:35], s[14:15], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[14:15], s[14:15], 48
-; GCN-HSA-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GCN-HSA-NEXT:    s_mov_b32 s16, s15
+; GCN-HSA-NEXT:    s_mov_b32 s18, s13
+; GCN-HSA-NEXT:    s_mov_b32 s20, s11
+; GCN-HSA-NEXT:    s_mov_b32 s22, s9
+; GCN-HSA-NEXT:    s_lshr_b32 s24, s14, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s26, s12, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s28, s10, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s4, s8, 16
 ; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[8:9], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[4:5], s[10:11], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[6:7], s[10:11], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[30:31], s[12:13], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[34:35], s[14:15], 0x100000
 ; GCN-HSA-NEXT:    s_ashr_i64 s[8:9], s[8:9], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[10:11], s[10:11], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[12:13], s[12:13], 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-HSA-NEXT:    s_bfe_i64 s[6:7], s[28:29], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[14:15], s[26:27], 0x100000
+; GCN-HSA-NEXT:    s_ashr_i64 s[14:15], s[14:15], 48
+; GCN-HSA-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x100000
-; GCN-HSA-NEXT:    s_add_u32 s26, s0, 0x70
-; GCN-HSA-NEXT:    s_addc_u32 s27, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s26
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-HSA-NEXT:    s_add_u32 s14, s0, 0x70
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-HSA-NEXT:    s_addc_u32 s15, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s15
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s12
 ; GCN-HSA-NEXT:    s_add_u32 s12, s0, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s27
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s13
 ; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s17
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s13
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s13
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    s_nop 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s10
 ; GCN-HSA-NEXT:    s_add_u32 s10, s0, 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s11
 ; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s21
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s11
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_nop 0
@@ -5834,8 +5832,8 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s9
 ; GCN-HSA-NEXT:    s_add_u32 s8, s0, 0x60
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -5843,8 +5841,8 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s25
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s9
 ; GCN-HSA-NEXT:    s_add_u32 s8, s0, 64
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -5852,25 +5850,25 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s30
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s31
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s24
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s26
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s27
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s9
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_nop 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT:    s_add_u32 s6, s0, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s28
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s29
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
@@ -6053,108 +6051,106 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s19, 0xf000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, -1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s1, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s3, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s0, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s35, s0, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s36, s2, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s1, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s3, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s35, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s37, s1, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s38, s3, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s1, s1, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, 0xffff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s25
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s27
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s26
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s23
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s25
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s23
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s37
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s18
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s26
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s28
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s27
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s29
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s28
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s30
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s29
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s31
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s30
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s33
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s31
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s34
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s33
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s35
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s35
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s34
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s36
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64:
@@ -6196,43 +6192,43 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-HSA-NEXT:    s_and_b32 s9, s9, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s11, s11, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s13, s13, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s15, s15, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s0, s15, 0xffff
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xf0
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xd0
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xb0
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x90
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s26
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xb0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s13
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s25
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x90
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s24
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s23
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -6579,95 +6575,97 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, s15
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s20, s13
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s40, s11
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s42, s9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s46, s7
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s44, s5
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s36, s3
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s38, s1
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[48:49], s[20:21], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[50:51], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s52, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s54, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s56, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s58, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s22, s11
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s24, s9
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s26, s7
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s28, s5
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s30, s3
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s34, s1
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s38, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s40, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s42, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s44, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s46, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s48, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s50, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[52:53], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[54:55], s[18:19], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[56:57], s[22:23], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[0:1], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[4:5], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[34:35], s[6:7], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[4:5], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[58:59], s[6:7], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[60:61], s[8:9], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[62:63], s[10:11], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[64:65], s[12:13], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[66:67], s[14:15], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[68:69], s[0:1], 48
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[70:71], s[2:3], 48
-; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[6:7], s[6:7], 48
-; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[8:9], s[8:9], 48
+; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[72:73], s[4:5], 48
+; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[2:3], s[8:9], 48
+; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[4:5], s[12:13], 48
+; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[8:9], s[14:15], 48
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[10:11], s[10:11], 48
-; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[2:3], s[12:13], 48
-; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[12:13], s[14:15], 48
-; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 48
+; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[12:13], s[6:7], 48
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s50
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s51
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s48
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s49
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s2
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s3
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s55
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s52
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s53
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s56
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s57
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s3
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[46:47], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[42:43], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[40:41], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[40:41], s[44:45], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[34:35], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[30:31], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[28:29], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[26:27], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[50:51], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[48:49], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[46:47], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[26:27], s[44:45], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[28:29], s[42:43], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[30:31], s[40:41], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[34:35], s[38:39], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x100000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s40
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s41
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[58:59], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[56:57], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[54:55], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[52:53], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[30:31], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[28:29], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x100000
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(5)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s37
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(3)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s13
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s72
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s73
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s70
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s71
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s68
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s69
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
@@ -6680,39 +6678,41 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s63
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s60
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s61
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s34
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s35
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s25
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, s21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s58
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s59
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s37
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s35
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s15
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s30
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s31
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s13
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s28
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s29
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s27
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s9
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, s7
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s7
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s5
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64:
@@ -6722,21 +6722,21 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_mov_b32 s42, s15
-; GCN-HSA-NEXT:    s_mov_b32 s48, s13
-; GCN-HSA-NEXT:    s_mov_b32 s50, s11
-; GCN-HSA-NEXT:    s_mov_b32 s52, s9
-; GCN-HSA-NEXT:    s_mov_b32 s54, s7
-; GCN-HSA-NEXT:    s_mov_b32 s56, s5
-; GCN-HSA-NEXT:    s_mov_b32 s44, s3
-; GCN-HSA-NEXT:    s_mov_b32 s58, s1
-; GCN-HSA-NEXT:    s_lshr_b32 s60, s14, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s62, s12, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s64, s10, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s66, s8, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s68, s6, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s70, s4, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s72, s2, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s74, s0, 16
+; GCN-HSA-NEXT:    s_mov_b32 s44, s13
+; GCN-HSA-NEXT:    s_mov_b32 s46, s11
+; GCN-HSA-NEXT:    s_mov_b32 s48, s9
+; GCN-HSA-NEXT:    s_mov_b32 s50, s7
+; GCN-HSA-NEXT:    s_mov_b32 s52, s5
+; GCN-HSA-NEXT:    s_mov_b32 s54, s3
+; GCN-HSA-NEXT:    s_mov_b32 s56, s1
+; GCN-HSA-NEXT:    s_lshr_b32 s58, s14, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s60, s12, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s62, s10, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s64, s8, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s66, s6, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s68, s4, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s70, s2, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s72, s0, 16
 ; GCN-HSA-NEXT:    s_bfe_i64 s[18:19], s[0:1], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
 ; GCN-HSA-NEXT:    s_ashr_i64 s[36:37], s[0:1], 48
@@ -6750,7 +6750,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-HSA-NEXT:    s_bfe_i64 s[30:31], s[12:13], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[34:35], s[14:15], 0x100000
 ; GCN-HSA-NEXT:    s_ashr_i64 s[40:41], s[4:5], 48
-; GCN-HSA-NEXT:    s_ashr_i64 s[46:47], s[6:7], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[74:75], s[6:7], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[76:77], s[8:9], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[78:79], s[10:11], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[80:81], s[12:13], 48
@@ -6758,107 +6758,109 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-HSA-NEXT:    s_bfe_i64 s[0:1], s[74:75], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[72:73], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[4:5], s[70:71], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[6:7], s[68:69], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[8:9], s[66:67], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[10:11], s[64:65], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[12:13], s[62:63], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[14:15], s[60:61], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[42:43], s[58:59], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[56:57], s[56:57], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[0:1], s[72:73], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[70:71], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[4:5], s[68:69], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[6:7], s[66:67], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[8:9], s[64:65], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[10:11], s[62:63], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[12:13], s[60:61], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[14:15], s[58:59], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[42:43], s[56:57], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[54:55], s[54:55], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[52:53], s[52:53], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x100000
-; GCN-HSA-NEXT:    s_add_u32 s58, s16, 0xf0
-; GCN-HSA-NEXT:    s_addc_u32 s59, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s48
-; GCN-HSA-NEXT:    s_add_u32 s48, s16, 0xd0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s49
-; GCN-HSA-NEXT:    s_addc_u32 s49, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s49
-; GCN-HSA-NEXT:    s_add_u32 s48, s16, 0xb0
-; GCN-HSA-NEXT:    s_addc_u32 s49, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s58
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s46
-; GCN-HSA-NEXT:    s_add_u32 s46, s16, 0x90
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s59
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s47
-; GCN-HSA-NEXT:    s_addc_u32 s47, s17, 0
+; GCN-HSA-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x100000
+; GCN-HSA-NEXT:    s_add_u32 s56, s16, 0xf0
+; GCN-HSA-NEXT:    s_addc_u32 s57, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s44
+; GCN-HSA-NEXT:    s_add_u32 s44, s16, 0xd0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s45
+; GCN-HSA-NEXT:    s_addc_u32 s45, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s45
+; GCN-HSA-NEXT:    s_add_u32 s44, s16, 0xb0
+; GCN-HSA-NEXT:    s_addc_u32 s45, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s45
+; GCN-HSA-NEXT:    s_add_u32 s44, s16, 0x90
+; GCN-HSA-NEXT:    s_addc_u32 s45, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s45
+; GCN-HSA-NEXT:    s_add_u32 s44, s16, 0x70
+; GCN-HSA-NEXT:    s_addc_u32 s45, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s56
+; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s45
+; GCN-HSA-NEXT:    s_add_u32 s44, s16, 0x50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s57
+; GCN-HSA-NEXT:    s_addc_u32 s45, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s40
-; GCN-HSA-NEXT:    s_add_u32 s40, s16, 0x70
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s41
-; GCN-HSA-NEXT:    s_addc_u32 s41, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s40
-; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s41
-; GCN-HSA-NEXT:    s_add_u32 s40, s16, 0x50
-; GCN-HSA-NEXT:    s_addc_u32 s41, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s80
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s81
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s38
 ; GCN-HSA-NEXT:    s_add_u32 s38, s16, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s81
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s39
+; GCN-HSA-NEXT:    s_addc_u32 s39, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s46
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s47
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s78
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s79
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s49
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s39
-; GCN-HSA-NEXT:    s_addc_u32 s39, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s38
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s40
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s39
-; GCN-HSA-NEXT:    s_add_u32 s38, s16, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s56
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s57
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s41
-; GCN-HSA-NEXT:    s_addc_u32 s39, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s46
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[20:23]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s14
-; GCN-HSA-NEXT:    s_add_u32 s14, s16, 0xe0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s52
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s53
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s36
+; GCN-HSA-NEXT:    s_add_u32 s36, s16, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s49
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s76
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s77
-; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s47
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s15
-; GCN-HSA-NEXT:    s_addc_u32 s15, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s74
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s75
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s44
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s37
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s38
+; GCN-HSA-NEXT:    s_addc_u32 s37, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s36
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s52
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s53
+; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s45
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s40
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s41
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s54
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s55
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s42
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s43
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s39
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s54
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s37
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[20:23]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-HSA-NEXT:    s_add_u32 s14, s16, 0xe0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-HSA-NEXT:    s_addc_u32 s15, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s34
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s35
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s15
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s12
 ; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0xc0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s55
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s13
 ; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s44
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s45
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s38
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s42
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s43
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s39
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s36
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s37
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s34
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s30
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s31
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s12
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[4:7]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s30
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s31
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s13
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    s_nop 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s10
 ; GCN-HSA-NEXT:    s_add_u32 s10, s16, 0xa0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s11
@@ -6922,141 +6924,139 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s19, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s18, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s36, s15
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s38, s13
-; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[82:83], s[14:15], 48
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s40, s11
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s48, s3
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s50, s1
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s64, s2, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s66, s0, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[18:19], s[0:1], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[68:69], s[0:1], 48
-; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[70:71], s[2:3], 48
-; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[80:81], s[12:13], 48
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s17
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s38, s15
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s40, s13
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s54, s14, 16
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[36:37], s[14:15], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[14:15], s[14:15], 48
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s36
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s37
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s82
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s83
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s42, s9
-; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[78:79], s[10:11], 48
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s42, s11
+; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[82:83], s[12:13], 48
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x100000
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s44, s7
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s38
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s39
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s80
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s81
-; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[76:77], s[8:9], 48
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s44, s9
+; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[80:81], s[10:11], 48
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x100000
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s46, s5
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s46, s7
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s40
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s41
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s78
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s79
-; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[74:75], s[6:7], 48
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s82
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s83
+; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[78:79], s[8:9], 48
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x100000
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[72:73], s[4:5], 48
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s48, s5
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s42
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s43
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s76
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s77
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s80
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s81
+; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[76:77], s[6:7], 48
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x100000
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s50, s3
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s44
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s45
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s74
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s75
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s52, s14, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s78
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s79
+; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[74:75], s[4:5], 48
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s52, s1
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s46
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s47
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s72
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s73
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s76
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s77
+; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[72:73], s[2:3], 48
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x100000
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s54, s12, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    s_ashr_i64 s[70:71], s[0:1], 48
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s48
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s49
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s70
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s71
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[34:35], s[14:15], 0x100000
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s74
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s75
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[52:53], s[52:53], 0x100000
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s56, s10, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s56, s12, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s50
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s51
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s68
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s69
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[30:31], s[12:13], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[16:17], s[54:55], 0x100000
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s58, s8, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s72
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s73
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[54:55], s[54:55], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s58, s10, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s52
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s53
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s70
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s71
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[34:35], s[12:13], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[12:13], s[56:57], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s60, s8, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s54
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s55
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[30:31], s[10:11], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[10:11], s[58:59], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s62, s6, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s35
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s52
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s53
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[28:29], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[14:15], s[56:57], 0x100000
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s60, s6, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s12
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s13
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[28:29], s[8:9], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[8:9], s[60:61], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s64, s4, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s30
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s31
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s17
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[26:27], s[8:9], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[12:13], s[58:59], 0x100000
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s62, s4, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[26:27], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[6:7], s[62:63], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s66, s2, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s28
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s29
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s14
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[24:25], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[10:11], s[60:61], 0x100000
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[22:23], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[24:25], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[64:65], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s68, s0, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s26
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s27
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s12
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s13
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[8:9], s[62:63], 0x100000
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[6:7], s[64:65], 0x100000
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[22:23], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[2:3], s[66:67], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[20:21], s[0:1], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s24
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s25
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s10
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s11
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[66:67], 0x100000
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[0:1], s[68:69], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s23
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s8
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s9
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s20
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s21
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s7
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s19
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_sextload_v32i16_to_v32i64:

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 5a8a3114a0a9b..0c433240f5f95 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -2144,17 +2144,17 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v7
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v6
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v1
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v0
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, 0xffff, v3
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v2
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v5
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v4
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v6
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, 0xffff, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v6
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
@@ -2164,50 +2164,50 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
-; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
+; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s1
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v6
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s1
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[1:4]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[5:6], v[8:11]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v2
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v6
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v4
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[12:15]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i32:
@@ -2675,94 +2675,94 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 48
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x60
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 64
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0x70
+; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0x50
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s8, s0, 64
+; GCN-HSA-NEXT:    s_add_u32 s8, s0, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0x50
-; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s12, s0, 32
-; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
+; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
 ; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v1
 ; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[16:19]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
 ; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v5
 ; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v7
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s3
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v9
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v9
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v8
-; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v11
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v10
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v11
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v10
-; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v15
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v12
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v15
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s1
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v14
-; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[15:18]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[7:10]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[11:12], v[3:6]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v9
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v8
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v13
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v12
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v14
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v13
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v12
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v11
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v12, 16, v10
+; GCN-HSA-NEXT:    v_and_b32_e32 v13, 0xffff, v11
+; GCN-HSA-NEXT:    v_and_b32_e32 v11, 0xffff, v10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v15
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v15
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[11:14]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[16:19]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i32:
@@ -3102,78 +3102,78 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s0
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 16, v1
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 16, v0
 ; GCN-HSA-NEXT:    v_bfe_i32 v18, v1, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v16, v0, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 16, v3
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 16, v2
-; GCN-HSA-NEXT:    v_bfe_i32 v18, v3, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v16, v2, 0, 16
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v4
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v5, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v4, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 16, v3
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 16, v2
+; GCN-HSA-NEXT:    v_bfe_i32 v18, v3, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v16, v2, 0, 16
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[16:19]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v7
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v6
 ; GCN-HSA-NEXT:    v_bfe_i32 v2, v7, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v6, 0, 16
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 16, v5
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 16, v4
+; GCN-HSA-NEXT:    v_bfe_i32 v18, v5, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v16, v4, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
+; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v9
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v8
 ; GCN-HSA-NEXT:    v_bfe_i32 v6, v9, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v4, v8, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s1
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[16:19]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v11
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v10
 ; GCN-HSA-NEXT:    v_bfe_i32 v2, v11, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v10, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[4:7]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[4:7]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[0:3]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(6)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v13
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v12
 ; GCN-HSA-NEXT:    v_bfe_i32 v6, v13, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v4, v12, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v15
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v14
 ; GCN-HSA-NEXT:    v_bfe_i32 v2, v15, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v14, 0, 16
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[4:7]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i32:
@@ -3587,18 +3587,18 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x50
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x60
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x70
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
@@ -3606,172 +3606,171 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s6, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s3, 0
-; GCN-HSA-NEXT:    s_add_u32 s8, s2, 48
-; GCN-HSA-NEXT:    s_addc_u32 s9, s3, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s6
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[28:29]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, 48
+; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 64
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s8
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s4
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[32:35], v[32:33]
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[24:25]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xe0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s1
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v31, 16, v13
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v29, 16, v12
-; GCN-HSA-NEXT:    v_and_b32_e32 v30, 0xffff, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v28, 0xffff, v12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v21
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v20
+; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v21
+; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v20
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[24:27]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s5
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xf0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[28:31]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s4
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xc0
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0xd0
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v29, 16, v14
-; GCN-HSA-NEXT:    v_and_b32_e32 v30, 0xffff, v15
-; GCN-HSA-NEXT:    v_and_b32_e32 v28, 0xffff, v14
 ; GCN-HSA-NEXT:    s_add_u32 s8, s0, 0xa0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v26, 16, v23
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v24, 16, v22
+; GCN-HSA-NEXT:    v_and_b32_e32 v25, 0xffff, v23
+; GCN-HSA-NEXT:    v_and_b32_e32 v23, 0xffff, v22
 ; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[28:31]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[23:26]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v21, 16, v16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s8
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v23, 16, v17
+; GCN-HSA-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GCN-HSA-NEXT:    v_and_b32_e32 v20, 0xffff, v16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s9
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
-; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v9
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v22, 16, v19
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v18
+; GCN-HSA-NEXT:    v_and_b32_e32 v21, 0xffff, v19
+; GCN-HSA-NEXT:    v_and_b32_e32 v19, 0xffff, v18
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s4
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[19:22]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v11
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v12, 16, v10
-; GCN-HSA-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GCN-HSA-NEXT:    v_and_b32_e32 v11, 0xffff, v10
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[11:14]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s7
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s6
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v6
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v13
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v12
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v13
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s7
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[25:26], v[16:19]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v15
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v14
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v15
+; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v14
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s5
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[7:10]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[15:18]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s4
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[4:7]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s3
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(11)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v17
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v17
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v19
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[0:3]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v9
+; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v9
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v8
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v19
-; GCN-HSA-NEXT:    v_and_b32_e32 v1, 0xffff, v18
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[5:6], v[1:4]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v11
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v10
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v11
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v10
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[12:15]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[23:24], v[16:19]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v13, 0xffff, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v11, 0xffff, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[25:26], v[11:14]
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v13, 0xffff, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v11, 0xffff, v0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v26, 16, v33
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v24, 16, v32
+; GCN-HSA-NEXT:    v_and_b32_e32 v25, 0xffff, v33
+; GCN-HSA-NEXT:    v_and_b32_e32 v23, 0xffff, v32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[23:26]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v34
+; GCN-HSA-NEXT:    v_and_b32_e32 v21, 0xffff, v35
+; GCN-HSA-NEXT:    v_and_b32_e32 v19, 0xffff, v34
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v6
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[19:22]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[15:18]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v29
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v28
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v29
+; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v28
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v33
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v32
-; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v33
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v32
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[15:18]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v25
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v24
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v25
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v24
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v27
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v26
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v27
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v26
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v35
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v34
-; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v35
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v34
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v31
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v30
+; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v31
+; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v30
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v21
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v20
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v21
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v20
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v23
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v22
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32:
@@ -4378,192 +4377,189 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x70
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x60
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x50
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCN-HSA-NEXT:    s_add_u32 s8, s2, 64
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    s_addc_u32 s9, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, 64
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s4
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 48
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s6, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s7
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s6
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[24:25]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s2
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[28:29]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s4
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[32:35], v[32:33]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xe0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s1
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v27, 16, v13
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v25, 16, v12
-; GCN-HSA-NEXT:    v_bfe_i32 v26, v13, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v24, v12, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[24:27]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v31, 16, v21
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v29, 16, v20
+; GCN-HSA-NEXT:    v_bfe_i32 v30, v21, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v28, v20, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v27, 16, v15
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v25, 16, v14
-; GCN-HSA-NEXT:    v_bfe_i32 v26, v15, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v24, v14, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[28:31]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[24:27]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v31, 16, v23
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v29, 16, v22
+; GCN-HSA-NEXT:    v_bfe_i32 v30, v23, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v28, v22, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[28:31]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v8
-; GCN-HSA-NEXT:    v_bfe_i32 v14, v9, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v12, v8, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s4
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v23, 16, v17
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v21, 16, v16
+; GCN-HSA-NEXT:    v_bfe_i32 v22, v17, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v20, v16, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[12:15]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[20:23]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 16, v11
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v12, 16, v10
-; GCN-HSA-NEXT:    v_bfe_i32 v13, v11, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v11, v10, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v22, 16, v19
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v20, 16, v18
+; GCN-HSA-NEXT:    v_bfe_i32 v21, v19, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v19, v18, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[11:14]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[19:22]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 16, v13
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 16, v12
+; GCN-HSA-NEXT:    v_bfe_i32 v18, v13, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v16, v12, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v39, s3
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v23, 16, v15
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v21, 16, v14
+; GCN-HSA-NEXT:    v_bfe_i32 v22, v15, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v20, v14, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v38, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[20:23]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v18, 16, v9
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v16, 16, v8
+; GCN-HSA-NEXT:    v_bfe_i32 v17, v9, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v15, v8, 0, 16
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v5
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v4
-; GCN-HSA-NEXT:    v_bfe_i32 v10, v5, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v4, 0, 16
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v7
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v6
-; GCN-HSA-NEXT:    v_bfe_i32 v14, v7, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v12, v6, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 16, v11
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v12, 16, v10
+; GCN-HSA-NEXT:    v_bfe_i32 v13, v11, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v11, v10, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[15:18]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[38:39], v[11:14]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
-; GCN-HSA-NEXT:    v_bfe_i32 v6, v1, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v4, v0, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v18, 16, v1
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v16, 16, v0
+; GCN-HSA-NEXT:    v_bfe_i32 v17, v1, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v15, v0, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[12:15]
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
-; GCN-HSA-NEXT:    v_bfe_i32 v10, v3, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v2, 0, 16
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[38:39], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(11)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v17
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v16
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v17, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v16, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[15:18]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v19
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v18
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v19, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v18, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 16, v7
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 16, v6
+; GCN-HSA-NEXT:    v_bfe_i32 v9, v7, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v7, v6, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 16, v5
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v12, 16, v4
+; GCN-HSA-NEXT:    v_bfe_i32 v13, v5, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v11, v4, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v6, 16, v3
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 16, v2
+; GCN-HSA-NEXT:    v_bfe_i32 v5, v3, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v3, v2, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v16, 16, v26
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
+; GCN-HSA-NEXT:    v_bfe_i32 v15, v26, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v22, 16, v25
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v20, 16, v24
+; GCN-HSA-NEXT:    v_bfe_i32 v21, v25, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v19, v24, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v26, 16, v33
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v24, 16, v32
+; GCN-HSA-NEXT:    v_bfe_i32 v25, v33, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v23, v32, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[23:26]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v35
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v34
+; GCN-HSA-NEXT:    v_bfe_i32 v2, v35, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v0, v34, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v23
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v22
-; GCN-HSA-NEXT:    v_bfe_i32 v10, v23, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v22, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v29
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v28
-; GCN-HSA-NEXT:    v_bfe_i32 v14, v29, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v12, v28, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[19:22]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v31
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v18, 16, v27
+; GCN-HSA-NEXT:    v_bfe_i32 v17, v27, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v30
-; GCN-HSA-NEXT:    v_bfe_i32 v10, v31, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v30, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[15:18]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v21
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v20
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v21, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v20, 0, 16
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(14)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v33
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v32
-; GCN-HSA-NEXT:    v_bfe_i32 v6, v33, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v4, v32, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v34
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v35, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v34, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32:
@@ -6060,45 +6056,45 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s1
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, v13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v11, 0xffff, v3
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[11:14]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, v12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, v12
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[3:6]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v1
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[12:15]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[16:19]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[4:7]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i64:
@@ -6257,22 +6253,22 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v0, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[10:11], v[0:1], 48
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v12, v5, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[14:15], v[2:3], 48
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v2, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[14:15], v[0:1], 48
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], 48
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v12, v1, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v5, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v6, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v7, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v6, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v7, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -6285,40 +6281,40 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT:    v_ashr_i64 v[6:7], v[0:1], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v4, v1, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, v3
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GCN-HSA-NEXT:    v_ashr_i64 v[14:15], v[0:1], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v12, v1, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v4, v0, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v6, v5, 0, 16
 ; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[2:3], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v11, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v6, v1, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v10, v10, 0, 16
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; GCN-HSA-NEXT:    v_bfe_i32 v0, v7, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[0:3]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[8:11]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i64:
@@ -6337,15 +6333,15 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, v3
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v11, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v3, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v0, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v5, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v6, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v7, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v10, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
@@ -6480,65 +6476,65 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v2
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v1
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v3
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v3
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, 0
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xffff, v6
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v2
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v23, 0xffff, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v19, 0xffff, v2
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v11, 0xffff, v1
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v15, 0xffff, v3
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v6
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v7
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v27, 0xffff, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, v21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v28, v21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v30, v21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, v21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, v21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v21
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v7, 0xffff, v5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, v8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v28, v8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v30, v8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, v8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, v8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, v8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, v8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, v8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, v8
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, 0
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
@@ -6547,64 +6543,65 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x50
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, 0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s4
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v1
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[16:19]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x70
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v5
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s4
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v7
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s5
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[16:19]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s5
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s4
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v7
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 0x60
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[16:19]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v8
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
 ; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v3
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[1:2], v[7:10]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[7:10]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 ; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s3
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s1
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[2:5]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[6:7], v[12:15]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[8:11]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64:
@@ -6621,35 +6618,34 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v30, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v31, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v15, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v23, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v27, 0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v3
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v27, 0xffff, v4
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v4
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, 0xffff, v4
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v4
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v28, v4
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v3
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, 0xffff, v6
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, 0xffff, v7
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v7
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v25, v4
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v29, v4
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v27, 0
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, 0xffff, v1
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v2
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, 0xffff, v7
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v7
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v25, v4
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v21, v4
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v17, v4
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v4
@@ -6658,6 +6654,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
@@ -6857,51 +6854,49 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, v7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, v3
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v4
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v8, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[10:11], v[6:7], 48
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[9:10], v[4:5], 48
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v7, v5, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v5, v0, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v7, v15, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v12, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[10:11], v[2:3], 48
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[11:12], v[0:1], 48
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v9, v1, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, v3
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v0, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[14:15], v[0:1], 48
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v10, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[18:19], v[2:3], 48
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v12, v1, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v2, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v14, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v13, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v4, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v17, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v13, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v20, v4, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v22, v11, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v23, v9, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[25:26], v[6:7], 48
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[26:27], v[4:5], 48
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v24, v5, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v6, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v19, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v6, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[5:8], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64:
@@ -6930,66 +6925,66 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
+; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-HSA-NEXT:    v_ashr_i64 v[10:11], v[0:1], 48
 ; GCN-HSA-NEXT:    v_bfe_i32 v8, v1, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
 ; GCN-HSA-NEXT:    v_bfe_i32 v8, v1, 0, 16
 ; GCN-HSA-NEXT:    v_ashr_i64 v[10:11], v[2:3], 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v2, 0, 16
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    v_bfe_i32 v10, v1, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v2, v3, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v10, v16, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v7
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[8:11]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
-; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[4:5], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v5, 0, 16
-; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, v7
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
-; GCN-HSA-NEXT:    v_bfe_i32 v10, v8, 0, 16
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v4, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v6, 0, 16
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GCN-HSA-NEXT:    v_ashr_i64 v[14:15], v[4:5], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v12, v5, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v8, v4, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v0, v6, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v10, v9, 0, 16
 ; GCN-HSA-NEXT:    v_ashr_i64 v[6:7], v[6:7], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v4, v11, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s1
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v1, 0, 16
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-HSA-NEXT:    v_bfe_i32 v2, v17, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v16, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[4:7]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64:
@@ -7008,55 +7003,55 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v0, 0, 16
-; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v9, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v9, v5, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v13, v7
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v5, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v9, v4, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v9, v13, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v4, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v1, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v3
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v0, 0, 16
+; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v2, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v10, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v5, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v22, v1, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, v3
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v2, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v4, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v6, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v9, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v9, v7
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v24, v6, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v4, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v5, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v11, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v3, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v9, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v26, v13, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v27, 31, v26
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
@@ -7263,6 +7258,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[14:17], off, s[8:11], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v3
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v5
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v8
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v2
@@ -7280,14 +7278,13 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v22, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v3
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v30, 0xffff, v5
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, 0xffff, v6
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, 0xffff, v8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v38, 0xffff, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, 0xffff, v5
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v6
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v38, 0xffff, v6
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v30, 0xffff, v8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v32, v22
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, 0xffff, v7
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v9
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v42, 0xffff, v9
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v10
@@ -7316,10 +7313,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v53, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v43, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v45, v1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v39, v1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v41, v1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v31, v1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v33, v1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, v1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v29, v1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v35, v1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v37, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, v23
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v1
@@ -7332,8 +7329,8 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v59, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v47, v1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, v1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v35, v1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v31, v1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v39, v1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
@@ -7351,23 +7348,23 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v61, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, 0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, v12
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v13
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, v14
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v37, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v29, 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v41, 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v33, 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v49, 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v61, 0
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(5)
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v8, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v9, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
@@ -7379,8 +7376,8 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -7388,153 +7385,158 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, 0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[2:5], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s4
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[9:12], v[9:10]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 48
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[10:13], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s3
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[13:16], v[13:14]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s2
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[17:20], v[17:18]
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[14:17], v[0:1]
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s6, s0, 16
+; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0xf0
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s8, s0, 0xf0
+; GCN-HSA-NEXT:    s_add_u32 s8, s0, 0xd0
 ; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0xd0
+; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0xb0
 ; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s12, s0, 0xb0
+; GCN-HSA-NEXT:    s_add_u32 s12, s0, 0x90
 ; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s14, s0, 0x90
+; GCN-HSA-NEXT:    s_add_u32 s14, s0, 0x70
 ; GCN-HSA-NEXT:    s_addc_u32 s15, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s16, s0, 0x70
-; GCN-HSA-NEXT:    s_addc_u32 s17, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s17
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s16
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s14
+; GCN-HSA-NEXT:    s_add_u32 s14, s0, 0x50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, v1
+; GCN-HSA-NEXT:    s_addc_u32 s15, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, v1
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v3
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[5:8]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[5:8]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s13
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v5
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s14
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s10
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v12
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s12
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[5:8]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s15
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v10
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s14
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[5:8]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s7
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v9
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s12
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v7
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s4
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v14
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v14
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v11
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s6
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[5:8]
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v20
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v20
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s8
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[5:8]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s11
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v18
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s10
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[5:8]
-; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v16
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[5:8]
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[5:8]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s9
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s8
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v15
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v15
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s1
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xe0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, 0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v10
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v10
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[11:12], v[17:20]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v16
+; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s4
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[15:16], v[9:12]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v14
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v23, 0xffff, v14
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v19
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v19
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[5:8]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v17
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[23:26]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[5:8]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v9
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v11
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[5:8]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v21, 16, v8
+; GCN-HSA-NEXT:    v_and_b32_e32 v19, 0xffff, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[19:22]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[17:20]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v1
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[15:18]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v4
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[5:8]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[9:12]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64:
@@ -7549,93 +7551,93 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[30:33], off, s[8:11], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[34:37], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v57, 0
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[29:32], off, s[8:11], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[33:36], off, s[8:11], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v56, 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v55, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v53, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v46, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v48, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v28, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v9, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v15, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v50, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v13, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v17, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v41, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v23, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v43, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v39, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v59, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v25, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v21, v57
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v27, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v12, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v14, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v54, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v52, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v45, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v47, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v9, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v22, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v49, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v20, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v24, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v18, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v40, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v58, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v42, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v38, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v16, v56
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v40, 16, v30
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v38, 0xffff, v30
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v44, 16, v32
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v42, 0xffff, v32
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v30, 0xffff, v31
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v47, 16, v33
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v45, 0xffff, v33
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v37, 0xffff, v29
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v31
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v41, 0xffff, v31
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v30
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v29, 0xffff, v30
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v32
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v44, 0xffff, v32
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v30, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v32, v56
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v11, 0xffff, v3
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v28, 16, v7
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v26, 0xffff, v7
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v54, 16, v36
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v52, 0xffff, v36
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v31, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v33, v57
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, 0xffff, v2
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v35
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v51, 0xffff, v35
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v29, v56
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v21, 16, v2
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v23, 0xffff, v0
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v19, 0xffff, v2
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v1
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v3
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, 0xffff, v4
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v6
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, 0xffff, v6
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v15, 0xffff, v6
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v7
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v27, 0xffff, v7
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v34
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v49, 0xffff, v34
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v36, 16, v35
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v34, 0xffff, v35
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v58, 16, v37
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v56, 0xffff, v37
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v35, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v37, v57
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v30, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, v57
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v52, 0
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v50, 16, v33
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v48, 0xffff, v33
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v34
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v33, 0xffff, v34
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v36
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v55, 0xffff, v36
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v34, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v36, v56
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, v56
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, v56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v51, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v26, 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v45, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v27, 0
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v44, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v56
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:160
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: global_zextload_v32i16_to_v32i64:
@@ -7981,105 +7983,101 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, v15
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v16, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[18:19], v[14:15], 48
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v3
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, v7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, v11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, v15
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v17, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[19:20], v[2:3], 48
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[17:18], v[12:13], 48
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v13, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v3
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[19:20], v[0:1], 48
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v1, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v4
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v13, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[17:18], v[2:3], 48
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v21, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[19:20], v[6:7], 48
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[17:18], v[0:1], 48
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v7
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[19:20], v[4:5], 48
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v5, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v10
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[17:18], v[6:7], 48
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v22, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[19:20], v[10:11], 48
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[17:18], v[4:5], 48
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v5, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v11
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[19:20], v[8:9], 48
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v9, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[17:18], v[10:11], 48
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v23, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[19:20], v[14:15], 48
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[17:18], v[8:9], 48
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v9, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[19:20], v[12:13], 48
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v13, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v14
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v14, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v16, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v2, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v12
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v11, v12, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v10
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v3, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v11, v2, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v3, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v1, v8, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v5, v10, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v1, v12, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v5, v14, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v7, v7, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v11, v9, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v9, v4, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v9, v8, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v10, 0, 16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v21, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v19, v24, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v4, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v6, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v2, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v21, v6, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v23, v2, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v0, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v19, v2, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v25, v0, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v27, v2, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v28, 31, v27
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64
@@ -8093,7 +8091,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 48
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
@@ -8101,9 +8099,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
@@ -8111,154 +8109,154 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[8:9], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v16, v9, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[4:5], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v16, v5, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[16:19]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xd0
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x90
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, v7
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0xb0
+; GCN-HSA-NEXT:    v_bfe_i32 v16, v5, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[6:7], 48
+; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0x70
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s8, s0, 0x90
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[16:19]
+; GCN-HSA-NEXT:    s_add_u32 s8, s0, 0x50
+; GCN-HSA-NEXT:    v_bfe_i32 v18, v5, 0, 16
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GCN-HSA-NEXT:    v_bfe_i32 v16, v6, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v4, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v6, v5, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0x70
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GCN-HSA-NEXT:    s_add_u32 s10, s0, 32
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[4:7]
 ; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v11
-; GCN-HSA-NEXT:    s_add_u32 s12, s0, 0x50
-; GCN-HSA-NEXT:    v_bfe_i32 v16, v9, 0, 16
-; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[10:11], 48
-; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT:    s_add_u32 s14, s0, 32
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v10
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
-; GCN-HSA-NEXT:    s_addc_u32 s15, s1, 0
-; GCN-HSA-NEXT:    v_bfe_i32 v18, v9, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v16, v10, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s14
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
+; GCN-HSA-NEXT:    v_ashr_i64 v[6:7], v[0:1], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v1, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s11
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s10
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v1, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[6:7], v[2:3], 48
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s15
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[16:19]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v8, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v10, v9, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s5
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(6)
-; GCN-HSA-NEXT:    v_ashr_i64 v[10:11], v[0:1], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v1, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s4
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v1, 0, 16
-; GCN-HSA-NEXT:    v_ashr_i64 v[10:11], v[2:3], 48
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s9
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[16:19]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s5
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
-; GCN-HSA-NEXT:    v_ashr_i64 v[10:11], v[4:5], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v5, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s8
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s7
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s6
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v1, 0, 16
-; GCN-HSA-NEXT:    v_ashr_i64 v[10:11], v[6:7], 48
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s13
+; GCN-HSA-NEXT:    v_ashr_i64 v[5:6], v[8:9], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v3, v9, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s4
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[3:6]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v11
+; GCN-HSA-NEXT:    v_bfe_i32 v3, v3, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[5:6], v[10:11], 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s9
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[3:6]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s8
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT:    v_ashr_i64 v[9:10], v[12:13], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v7, v13, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s12
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; GCN-HSA-NEXT:    v_ashr_i64 v[5:6], v[12:13], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v3, v13, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[3:6]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s11
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[7:10]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s10
-; GCN-HSA-NEXT:    v_bfe_i32 v7, v3, 0, 16
-; GCN-HSA-NEXT:    v_ashr_i64 v[9:10], v[14:15], 48
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; GCN-HSA-NEXT:    v_bfe_i32 v3, v3, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[5:6], v[14:15], 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s6
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[7:10]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[3:6]
+; GCN-HSA-NEXT:    v_bfe_i32 v19, v0, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v23, v2, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v21, v1, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v25, v25, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_bfe_i32 v7, v2, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v9, v1, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[23:26]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[1:2], v[7:10]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
+; GCN-HSA-NEXT:    v_bfe_i32 v15, v10, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v17, v18, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[19:22]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v4
-; GCN-HSA-NEXT:    v_bfe_i32 v11, v4, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v17, v5, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_bfe_i32 v15, v6, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v14
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
+; GCN-HSA-NEXT:    v_bfe_i32 v13, v7, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v7, v8, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v9, v9, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[15:18]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v3, 0, 16
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[15:18]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GCN-HSA-NEXT:    v_bfe_i32 v13, v13, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
-; GCN-HSA-NEXT:    v_bfe_i32 v7, v14, 0, 16
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v12
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v12, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v11, v14, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v12
+; GCN-HSA-NEXT:    v_bfe_i32 v3, v12, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_bfe_i32 v9, v9, 0, 16
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[11:14]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    v_bfe_i32 v5, v5, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[7:10]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64:
@@ -8271,95 +8269,95 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[18:19], 48, v[12:13]
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v13, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v14, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[22:23], 48, v[0:1]
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v1, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, v3
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[18:19], 48, v[0:1]
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[22:23], 48, v[4:5]
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v5, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v19, v19, 0, 16
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[18:19], 48, v[4:5]
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v5, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[22:23], 48, v[8:9]
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v9, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v18, v7
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v5, v10, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[18:19], 48, v[8:9]
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v9, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v1, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[22:23], 48, v[12:13]
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v13, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v18, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[21:22], 48, v[2:3]
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v17, v11
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[20:21], 48, v[6:7]
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:176
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v17, v17, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[19:20], 48, v[10:11]
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v16, v15
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v16, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[18:19], 48, v[14:15]
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v3
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[17:18], 48, v[2:3]
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v7
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[17:18], 48, v[6:7]
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v11
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v10
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i64 v[17:18], 48, v[10:11]
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v12
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v12, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v2, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v3, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v2, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v23, v2, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v17, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v17, v0, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v19, v2, 0, 16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:160
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v21, 16, v10
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v14
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v12
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v25, v0, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v27, v2, 0, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v3, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v1, v8, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v1, v12, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v5, v14, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v7, v7, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v9, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v9, v4, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v6, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v9, v8, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v10, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v21, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v19, v22, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v17, v4, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v21, v6, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v28, 31, v27
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64

diff  --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir
index 154860e1ac970..625eee703f693 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir
@@ -1,6 +1,14 @@
 # RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler -amdgpu-disable-unclustred-high-rp-reschedule -verify-machineinstrs %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s
 # REQUIRES: asserts
 
+--- |
+  define void @sink_and_inc_idx_when_skipping_small_region_1() "amdgpu-flat-work-group-size"="1,64" {
+    ret void
+  }
+
+  define void @sink_and_inc_idx_when_skipping_small_regions_2() "amdgpu-flat-work-group-size"="1,64" {
+    ret void
+  }
 ---
 name:            sink_and_inc_idx_when_skipping_small_region_1
 tracksRegLiveness: true

diff  --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index 62e83fc1767b3..5fb35f8816514 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -139,16 +139,16 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]]
@@ -248,14 +248,14 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
@@ -356,15 +356,15 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_22]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
@@ -464,27 +464,27 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
   ; GFX908-NEXT:   successors: %bb.3(0x80000000)
   ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
   ; GFX908-NEXT:   S_NOP 0
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.3:
   ; GFX908-NEXT:   successors: %bb.4(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.4:
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
@@ -600,29 +600,29 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_22]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
   ; GFX908-NEXT:   successors: %bb.3(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode
   ; GFX908-NEXT:   S_NOP 0
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.3:
   ; GFX908-NEXT:   successors: %bb.4(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_25]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.4:
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
@@ -722,7 +722,6 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
@@ -743,6 +742,8 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
   ; GFX908-NEXT:   undef %4.sub1:sreg_64 = S_MOV_B32 0
   ; GFX908-NEXT:   undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT: {{  $}}
@@ -758,8 +759,8 @@ body:             |
   ; GFX908-NEXT: bb.2:
   ; GFX908-NEXT:   successors: %bb.3(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.3:
   ; GFX908-NEXT:   successors: %bb.5(0x04000000), %bb.4(0x7c000000)
@@ -772,8 +773,7 @@ body:             |
   ; GFX908-NEXT: bb.4:
   ; GFX908-NEXT:   successors: %bb.1(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
   ; GFX908-NEXT:   S_BRANCH %bb.1
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.5:
@@ -1114,6 +1114,14 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
   ; GFX908-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
   ; GFX908-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
@@ -1186,19 +1194,12 @@ body:             |
   ; GFX908-NEXT:   [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
   ; GFX908-NEXT:   [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
   ; GFX908-NEXT:   [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 78
   ; GFX908-NEXT:   [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 79
   ; GFX908-NEXT:   [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 80
@@ -1215,14 +1216,13 @@ body:             |
   ; GFX908-NEXT: bb.2:
   ; GFX908-NEXT:   successors: %bb.3(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.3:
   ; GFX908-NEXT:   successors: %bb.5(0x04000000), %bb.4(0x7c000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]]
   ; GFX908-NEXT:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
   ; GFX908-NEXT:   undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc
   ; GFX908-NEXT:   S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc
@@ -1643,6 +1643,10 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
   ; GFX908-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
   ; GFX908-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
@@ -1715,10 +1719,6 @@ body:             |
   ; GFX908-NEXT:   [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
   ; GFX908-NEXT:   [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
   ; GFX908-NEXT:   [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
   ; GFX908-NEXT:   [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
   ; GFX908-NEXT:   [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
@@ -1759,9 +1759,6 @@ body:             |
   ; GFX908-NEXT:   S_BRANCH %bb.1
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.5:
-  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_4]], implicit [[S_MOV_B32_5]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_10]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_11]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_12]]
@@ -1774,6 +1771,9 @@ body:             |
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_19]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_4]], implicit [[S_MOV_B32_5]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_6]], implicit [[S_MOV_B32_7]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_8]], implicit [[S_MOV_B32_9]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_10]], implicit [[S_MOV_B32_11]]
@@ -2049,6 +2049,10 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
   ; GFX908-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
   ; GFX908-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
@@ -2121,13 +2125,9 @@ body:             |
   ; GFX908-NEXT:   [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
   ; GFX908-NEXT:   [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
   ; GFX908-NEXT:   [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
   ; GFX908-NEXT:   [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
   ; GFX908-NEXT:   [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
@@ -2801,7 +2801,6 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
@@ -2823,6 +2822,7 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
   ; GFX908-NEXT:   undef %4.sub1:sreg_64 = S_MOV_B32 0
   ; GFX908-NEXT:   undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT: {{  $}}
@@ -2988,6 +2988,7 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
   ; GFX908-NEXT:   undef %4.sub1:sreg_64 = S_MOV_B32 0
   ; GFX908-NEXT:   undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT: {{  $}}
@@ -3003,10 +3004,9 @@ body:             |
   ; GFX908-NEXT: bb.2:
   ; GFX908-NEXT:   successors: %bb.3(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.3:
   ; GFX908-NEXT:   successors: %bb.5(0x04000000), %bb.4(0x7c000000)
@@ -4974,13 +4974,13 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   undef %21.sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   undef %21.sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x80000000)
@@ -5192,13 +5192,13 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x80000000)
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]]
-  ; GFX908-NEXT:   [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_F64_I32_e32_]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
@@ -5297,6 +5297,7 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x80000000)
@@ -5304,7 +5305,6 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]]
-  ; GFX908-NEXT:   [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_F64_I32_e32_]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
@@ -5726,17 +5726,17 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
-  ; GFX908-NEXT:   DBG_VALUE %23, 0, 0
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   DBG_VALUE [[V_CVT_I32_F64_e32_23]], 0, 0
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]]
@@ -5836,17 +5836,17 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
   ; GFX908-NEXT:   INLINEASM &"v_or_b32 $0, 0, $1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def %22, 327689 /* reguse:SReg_1_with_sub0 */, [[V_CVT_I32_F64_e32_4]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]]

diff  --git a/llvm/test/CodeGen/AMDGPU/memory_clause.mir b/llvm/test/CodeGen/AMDGPU/memory_clause.mir
index 589dba2f5a569..ae21c38eceb02 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.mir
@@ -261,9 +261,11 @@ body:             |
 # GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, implicit $exec
 # GCN-NEXT: dead %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, implicit $exec
 # GCN-NEXT: dead %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, implicit $exec
+# GCN-NEXT: dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, implicit $exec
+# GCN-NEXT: dead %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, implicit $exec
 # GCN-NEXT: KILL %0{{$}}
-# GCN-NEXT: dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 80, 0, implicit $exec
-# GCN-NEXT: dead %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 96, 0, implicit $exec
+# GCN-NEXT: dead %9:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 128, 0, implicit $exec
+# GCN-NEXT: dead %10:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 144, 0, implicit $exec
 # GCN-NEXT: KILL %1{{$}}
 
 ---
@@ -278,8 +280,10 @@ body:             |
     %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, implicit $exec
     %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, implicit $exec
     %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, implicit $exec
-    %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 80, 0, implicit $exec
-    %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 96, 0, implicit $exec
+    %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, implicit $exec
+    %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, implicit $exec
+    %9:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 128, 0, implicit $exec
+    %10:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 144, 0, implicit $exec
 ...
 
 # GCN-LABEL: {{^}}name: image_clause{{$}}

diff  --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 6a14b88eb630e..ba287349c5756 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -73,22 +73,22 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3)
 ; GFX9-NEXT:  .LBB1_2: ; %bb23
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v0
+; GFX9-NEXT:    v_add_u32_e32 v18, v9, v0
 ; GFX9-NEXT:    v_add_u32_e32 v12, v17, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_madak_f32 v3, v3, v7, 0x3727c5ac
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_mul_u32_u24_e32 v18, v3, v5
-; GFX9-NEXT:    v_add_u32_e32 v19, v3, v16
-; GFX9-NEXT:    v_add_u32_e32 v3, v9, v0
-; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v18
-; GFX9-NEXT:    v_sub_u32_e32 v12, v12, v18
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[4:5], v19, v13
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v19, v15, v[3:4]
+; GFX9-NEXT:    v_mul_u32_u24_e32 v19, v3, v5
+; GFX9-NEXT:    v_add_u32_e32 v20, v3, v16
+; GFX9-NEXT:    v_sub_u32_e32 v3, v18, v19
+; GFX9-NEXT:    v_sub_u32_e32 v12, v12, v19
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v20, v15, v[3:4]
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[4:5], v20, v13
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[6:7], v12, v14
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, v18, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[18:19], 2, v[3:4]
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add_co_u32_e64 v18, s[6:7], v10, v18
 ; GFX9-NEXT:    v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7]
 ; GFX9-NEXT:    global_load_dword v3, v[18:19], off

diff  --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
index 600118386fe32..3527329d1ee3b 100644
--- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
+++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
@@ -15,7 +15,7 @@
 ; GFX1010:    ; Occupancy: 20
 ; GFX1030:    ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
-define amdgpu_kernel void @max_occupancy() {
+define amdgpu_kernel void @max_occupancy() #10 {
   ret void
 }
 
@@ -52,7 +52,7 @@ define amdgpu_kernel void @limited_occupancy_19() #2 {
 ; GFX1010:    ; Occupancy: 20
 ; GFX1030:    ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
-define amdgpu_kernel void @used_24_vgprs() {
+define amdgpu_kernel void @used_24_vgprs() #10 {
   call void asm sideeffect "", "~{v23}" ()
   ret void
 }
@@ -63,7 +63,7 @@ define amdgpu_kernel void @used_24_vgprs() {
 ; GFX1010W32: ; Occupancy: 20
 ; GFX1030:    ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
-define amdgpu_kernel void @used_28_vgprs() {
+define amdgpu_kernel void @used_28_vgprs() #10 {
   call void asm sideeffect "", "~{v27}" ()
   ret void
 }
@@ -74,7 +74,7 @@ define amdgpu_kernel void @used_28_vgprs() {
 ; GFX1010W32: ; Occupancy: 20
 ; GFX1030W32: ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
-define amdgpu_kernel void @used_32_vgprs() {
+define amdgpu_kernel void @used_32_vgprs() #10 {
   call void asm sideeffect "", "~{v31}" ()
   ret void
 }
@@ -86,7 +86,7 @@ define amdgpu_kernel void @used_32_vgprs() {
 ; GFX1030W64: ; Occupancy: 12
 ; GFX1030W32: ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
-define amdgpu_kernel void @used_36_vgprs() {
+define amdgpu_kernel void @used_36_vgprs() #10 {
   call void asm sideeffect "", "~{v35}" ()
   ret void
 }
@@ -97,7 +97,7 @@ define amdgpu_kernel void @used_36_vgprs() {
 ; GFX1010W32: ; Occupancy: 20
 ; GFX1030W32: ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
-define amdgpu_kernel void @used_40_vgprs() {
+define amdgpu_kernel void @used_40_vgprs() #10 {
   call void asm sideeffect "", "~{v39}" ()
   ret void
 }
@@ -109,7 +109,7 @@ define amdgpu_kernel void @used_40_vgprs() {
 ; GFX1030W64: ; Occupancy: 10
 ; GFX1030W32: ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
-define amdgpu_kernel void @used_44_vgprs() {
+define amdgpu_kernel void @used_44_vgprs() #10 {
   call void asm sideeffect "", "~{v43}" ()
   ret void
 }
@@ -120,7 +120,7 @@ define amdgpu_kernel void @used_44_vgprs() {
 ; GFX1010W32: ; Occupancy: 20
 ; GFX1030W32: ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
-define amdgpu_kernel void @used_48_vgprs() {
+define amdgpu_kernel void @used_48_vgprs() #10 {
   call void asm sideeffect "", "~{v47}" ()
   ret void
 }
@@ -132,7 +132,7 @@ define amdgpu_kernel void @used_48_vgprs() {
 ; GFX1030W32: ; Occupancy: 16
 ; GFX1100W64: ; Occupancy: 12
 ; GFX1100W32: ; Occupancy: 16
-define amdgpu_kernel void @used_56_vgprs() {
+define amdgpu_kernel void @used_56_vgprs() #10 {
   call void asm sideeffect "", "~{v55}" ()
   ret void
 }
@@ -143,7 +143,7 @@ define amdgpu_kernel void @used_56_vgprs() {
 ; GFX10W32:   ; Occupancy: 16
 ; GFX1100W64: ; Occupancy: 10
 ; GFX1100W32: ; Occupancy: 16
-define amdgpu_kernel void @used_64_vgprs() {
+define amdgpu_kernel void @used_64_vgprs() #10 {
   call void asm sideeffect "", "~{v63}" ()
   ret void
 }
@@ -155,7 +155,7 @@ define amdgpu_kernel void @used_64_vgprs() {
 ; GFX1030W32: ; Occupancy: 12
 ; GFX1100W64: ; Occupancy: 10
 ; GFX1100W32: ; Occupancy: 16
-define amdgpu_kernel void @used_72_vgprs() {
+define amdgpu_kernel void @used_72_vgprs() #10 {
   call void asm sideeffect "", "~{v71}" ()
   ret void
 }
@@ -166,7 +166,7 @@ define amdgpu_kernel void @used_72_vgprs() {
 ; GFX10W32:   ; Occupancy: 12
 ; GFX1100W64: ; Occupancy: 9
 ; GFX1100W32: ; Occupancy: 16
-define amdgpu_kernel void @used_80_vgprs() {
+define amdgpu_kernel void @used_80_vgprs() #10 {
   call void asm sideeffect "", "~{v79}" ()
   ret void
 }
@@ -179,7 +179,7 @@ define amdgpu_kernel void @used_80_vgprs() {
 ; GFX1030W32: ; Occupancy: 10
 ; GFX1100W64: ; Occupancy: 9
 ; GFX1100W32: ; Occupancy: 16
-define amdgpu_kernel void @used_84_vgprs() {
+define amdgpu_kernel void @used_84_vgprs() #10 {
   call void asm sideeffect "", "~{v83}" ()
   ret void
 }
@@ -191,7 +191,7 @@ define amdgpu_kernel void @used_84_vgprs() {
 ; GFX1030W32: ; Occupancy: 10
 ; GFX1100W64: ; Occupancy: 8
 ; GFX1100W32: ; Occupancy: 16
-define amdgpu_kernel void @used_88_vgprs() {
+define amdgpu_kernel void @used_88_vgprs() #10 {
   call void asm sideeffect "", "~{v87}" ()
   ret void
 }
@@ -202,7 +202,7 @@ define amdgpu_kernel void @used_88_vgprs() {
 ; GFX10W32:   ; Occupancy: 10
 ; GFX1100W64: ; Occupancy: 8
 ; GFX1100W32: ; Occupancy: 16
-define amdgpu_kernel void @used_96_vgprs() {
+define amdgpu_kernel void @used_96_vgprs() #10 {
   call void asm sideeffect "", "~{v95}" ()
   ret void
 }
@@ -214,7 +214,7 @@ define amdgpu_kernel void @used_96_vgprs() {
 ; GFX10W32:   ; Occupancy: 9
 ; GFX1100W64: ; Occupancy: 7
 ; GFX1100W32: ; Occupancy: 12
-define amdgpu_kernel void @used_100_vgprs() {
+define amdgpu_kernel void @used_100_vgprs() #10 {
   call void asm sideeffect "", "~{v99}" ()
   ret void
 }
@@ -225,7 +225,7 @@ define amdgpu_kernel void @used_100_vgprs() {
 ; GFX10W32:   ; Occupancy: 9
 ; GFX1100W64: ; Occupancy: 6
 ; GFX1100W32: ; Occupancy: 12
-define amdgpu_kernel void @used_112_vgprs() {
+define amdgpu_kernel void @used_112_vgprs() #10 {
   call void asm sideeffect "", "~{v111}" ()
   ret void
 }
@@ -236,7 +236,7 @@ define amdgpu_kernel void @used_112_vgprs() {
 ; GFX10W32:   ; Occupancy: 8
 ; GFX1100W64: ; Occupancy: 5
 ; GFX1100W32: ; Occupancy: 10
-define amdgpu_kernel void @used_128_vgprs() {
+define amdgpu_kernel void @used_128_vgprs() #10 {
   call void asm sideeffect "", "~{v127}" ()
   ret void
 }
@@ -247,7 +247,7 @@ define amdgpu_kernel void @used_128_vgprs() {
 ; GFX10W32:   ; Occupancy: 7
 ; GFX1100W64: ; Occupancy: 5
 ; GFX1100W32: ; Occupancy: 10
-define amdgpu_kernel void @used_144_vgprs() {
+define amdgpu_kernel void @used_144_vgprs() #10 {
   call void asm sideeffect "", "~{v143}" ()
   ret void
 }
@@ -259,7 +259,7 @@ define amdgpu_kernel void @used_144_vgprs() {
 ; GFX1030W32: ; Occupancy: 5
 ; GFX1100W64: ; Occupancy: 4
 ; GFX1100W32: ; Occupancy: 9
-define amdgpu_kernel void @used_168_vgprs() {
+define amdgpu_kernel void @used_168_vgprs() #10 {
   call void asm sideeffect "", "~{v167}" ()
   ret void
 }
@@ -271,7 +271,7 @@ define amdgpu_kernel void @used_168_vgprs() {
 ; GFX1030W32: ; Occupancy: 4
 ; GFX1100W64: ; Occupancy: 3
 ; GFX1100W32: ; Occupancy: 7
-define amdgpu_kernel void @used_200_vgprs() {
+define amdgpu_kernel void @used_200_vgprs() #10 {
   call void asm sideeffect "", "~{v199}" ()
   ret void
 }
@@ -282,7 +282,7 @@ define amdgpu_kernel void @used_200_vgprs() {
 ; GFX10W32:   ; Occupancy: 4
 ; GFX1100W64: ; Occupancy: 2
 ; GFX1100W32: ; Occupancy: 5
-define amdgpu_kernel void @used_256_vgprs() {
+define amdgpu_kernel void @used_256_vgprs() #10 {
   call void asm sideeffect "", "~{v255}" ()
   ret void
 }
@@ -292,7 +292,7 @@ define amdgpu_kernel void @used_256_vgprs() {
 ; GFX1010:    ; Occupancy: 20
 ; GFX1030:    ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
-define amdgpu_kernel void @used_80_sgprs() {
+define amdgpu_kernel void @used_80_sgprs() #10 {
   call void asm sideeffect "", "~{s79}" ()
   ret void
 }
@@ -302,7 +302,7 @@ define amdgpu_kernel void @used_80_sgprs() {
 ; GFX1010:    ; Occupancy: 20
 ; GFX1030:    ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
-define amdgpu_kernel void @used_88_sgprs() {
+define amdgpu_kernel void @used_88_sgprs() #10 {
   call void asm sideeffect "", "~{s87}" ()
   ret void
 }
@@ -312,7 +312,7 @@ define amdgpu_kernel void @used_88_sgprs() {
 ; GFX1010:    ; Occupancy: 20
 ; GFX1030:    ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
-define amdgpu_kernel void @used_100_sgprs() {
+define amdgpu_kernel void @used_100_sgprs() #10 {
   call void asm sideeffect "", "~{s99}" ()
   ret void
 }
@@ -322,15 +322,16 @@ define amdgpu_kernel void @used_100_sgprs() {
 ; GFX1010:    ; Occupancy: 20
 ; GFX1030:    ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
-define amdgpu_kernel void @used_101_sgprs() {
+define amdgpu_kernel void @used_101_sgprs() #10 {
   call void asm sideeffect "", "~{s100}" ()
   ret void
 }
 
 ; GCN-LABEL: {{^}}used_lds_6552:
-; GFX9:       ; Occupancy: 10
-; GFX1010:    ; Occupancy: 20
-; GFX1030:    ; Occupancy: 16
+; GFX9:       ; Occupancy: 8
+; GFX1010W64: ; Occupancy: 20
+; GFX1030W64: ; Occupancy: 16
+; GFX10W32:   ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
 @lds6552 = internal addrspace(3) global [6552 x i8] undef, align 4
 define amdgpu_kernel void @used_lds_6552() {
@@ -339,9 +340,10 @@ define amdgpu_kernel void @used_lds_6552() {
 }
 
 ; GCN-LABEL: {{^}}used_lds_6556:
-; GFX9:       ; Occupancy: 10
-; GFX1010:    ; Occupancy: 20
-; GFX1030:    ; Occupancy: 16
+; GFX9:       ; Occupancy: 8
+; GFX1010W64: ; Occupancy: 20
+; GFX1030W64: ; Occupancy: 16
+; GFX10W32:   ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
 @lds6556 = internal addrspace(3) global [6556 x i8] undef, align 4
 define amdgpu_kernel void @used_lds_6556() {
@@ -350,9 +352,10 @@ define amdgpu_kernel void @used_lds_6556() {
 }
 
 ; GCN-LABEL: {{^}}used_lds_13112:
-; GFX9:       ; Occupancy: 10
-; GFX1010:    ; Occupancy: 20
-; GFX1030:    ; Occupancy: 16
+; GFX9:       ; Occupancy: 8
+; GFX1010W64: ; Occupancy: 20
+; GFX1030W64: ; Occupancy: 16
+; GFX10W32:   ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
 @lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4
 define amdgpu_kernel void @used_lds_13112() {
@@ -361,11 +364,11 @@ define amdgpu_kernel void @used_lds_13112() {
 }
 
 ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_64:
-; GFX9:       ; Occupancy: 7{{$}}
-; GFX10W64:   ; Occupancy: 7{{$}}
-; GFX10W32:   ; Occupancy: 14{{$}}
-; GFX1100W64: ; Occupancy: 7{{$}}
-; GFX1100W32: ; Occupancy: 14{{$}}
+; GFX9:       ; Occupancy: 2{{$}}
+; GFX10W64:   ; Occupancy: 4{{$}}
+; GFX10W32:   ; Occupancy: 8{{$}}
+; GFX1100W64: ; Occupancy: 4{{$}}
+; GFX1100W32: ; Occupancy: 8{{$}}
 @lds8252 = internal addrspace(3) global [8252 x i8] undef, align 4
 define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 {
   store volatile i8 1, ptr addrspace(3) @lds8252
@@ -373,44 +376,46 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 {
 }
 
 ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_96:
-; GFX9:       ; Occupancy: 10{{$}}
-; GFX10W64:   ; Occupancy: 14{{$}}
-; GFX1010W32: ; Occupancy: 20{{$}}
-; GFX1030W32: ; Occupancy: 16{{$}}
-; GFX1100W64: ; Occupancy: 14{{$}}
-; GFX1100W32: ; Occupancy: 16{{$}}
+; GFX9:       ; Occupancy: 4{{$}}
+; GFX10W64:   ; Occupancy: 8{{$}}
+; GFX10W32:   ; Occupancy: 12{{$}}
+; GFX1100W64: ; Occupancy: 8{{$}}
+; GFX1100W32: ; Occupancy: 12{{$}}
 define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 {
   store volatile i8 1, ptr addrspace(3) @lds8252
   ret void
 }
 
 ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_128:
-; GFX9:       ; Occupancy: 10{{$}}
-; GFX10W64:   ; Occupancy: 14{{$}}
-; GFX1010W32: ; Occupancy: 20{{$}}
-; GFX1030W32: ; Occupancy: 16{{$}}
-; GFX1100W64: ; Occupancy: 14{{$}}
-; GFX1100W32: ; Occupancy: 16{{$}}
+; GFX9:       ; Occupancy: 4{{$}}
+; GFX10W64:   ; Occupancy: 8{{$}}
+; GFX10W32:   ; Occupancy: 15{{$}}
+; GFX1100W64: ; Occupancy: 8{{$}}
+; GFX1100W32: ; Occupancy: 15{{$}}
 define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 {
   store volatile i8 1, ptr addrspace(3) @lds8252
   ret void
 }
 
 ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_192:
-; GFX9:       ; Occupancy: 10{{$}}
-; GFX1010:    ; Occupancy: 20{{$}}
-; GFX1030:    ; Occupancy: 16{{$}}
-; GFX1100:    ; Occupancy: 16{{$}}
+; GFX9:       ; Occupancy: 6{{$}}
+; GFX10W64:   ; Occupancy: 12{{$}}
+; GFX1010W32: ; Occupancy: 20{{$}}
+; GFX1030W32: ; Occupancy: 15{{$}}
+; GFX1100W64: ; Occupancy: 12{{$}}
+; GFX1100W32: ; Occupancy: 15{{$}}
 define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 {
   store volatile i8 1, ptr addrspace(3) @lds8252
   ret void
 }
 
 ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_256:
-; GFX9:       ; Occupancy: 10{{$}}
-; GFX1010:    ; Occupancy: 20{{$}}
-; GFX1030:    ; Occupancy: 16{{$}}
-; GFX1100:    ; Occupancy: 16{{$}}
+; GFX9:       ; Occupancy: 7{{$}}
+; GFX10W64:   ; Occupancy: 15{{$}}
+; GFX1010W32: ; Occupancy: 20{{$}}
+; GFX1030W32: ; Occupancy: 16{{$}}
+; GFX1100W64: ; Occupancy: 15{{$}}
+; GFX1100W32: ; Occupancy: 16{{$}}
 define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 {
   store volatile i8 1, ptr addrspace(3) @lds8252
   ret void
@@ -427,8 +432,9 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 {
 }
 
 ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_1024:
-; GFX9:       ; Occupancy: 10{{$}}
-; GFX1010:    ; Occupancy: 20{{$}}
+; GFX9:       ; Occupancy: 8{{$}}
+; GFX1010W32: ; Occupancy: 16{{$}}
+; GFX1010W64: ; Occupancy: 20{{$}}
 ; GFX1030:    ; Occupancy: 16{{$}}
 ; GFX1100:    ; Occupancy: 16{{$}}
 define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
@@ -437,17 +443,17 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
 }
 
 ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_32:
-; GFX9:       ; Occupancy: 7{{$}}
-; GFX10:      ; Occupancy: 7{{$}}
-; GFX1100:    ; Occupancy: 7{{$}}
+; GFX9:       ; Occupancy: 2{{$}}
+; GFX10:      ; Occupancy: 4{{$}}
+; GFX1100:    ; Occupancy: 4{{$}}
 define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
   store volatile i8 1, ptr addrspace(3) @lds8252
   ret void
 }
 
 attributes #0 = { "amdgpu-waves-per-eu"="2,3" "amdgpu-flat-work-group-size"="1,64" }
-attributes #1 = { "amdgpu-waves-per-eu"="18,18" }
-attributes #2 = { "amdgpu-waves-per-eu"="19,19" }
+attributes #1 = { "amdgpu-waves-per-eu"="18,18" "amdgpu-flat-work-group-size"="1,32" }
+attributes #2 = { "amdgpu-waves-per-eu"="19,19" "amdgpu-flat-work-group-size"="1,32" }
 attributes #3 = { "amdgpu-flat-work-group-size"="1,64" }
 attributes #4 = { "amdgpu-flat-work-group-size"="1,96" }
 attributes #5 = { "amdgpu-flat-work-group-size"="1,128" }

diff  --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir
index c8d1275f56c96..ce1a9ad58f011 100644
--- a/llvm/test/CodeGen/AMDGPU/pr51516.mir
+++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir
@@ -5,7 +5,7 @@
 
 # GCN-LABEL: name: global_sextload_v32i32_to_v32i64
 # GCN: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
-# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr20, killed renamable $vgpr24_vgpr25_vgpr26_vgpr27, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr0
+# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr20, killed renamable $vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr0
 
 ---
 name:            global_sextload_v32i32_to_v32i64

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 271991b983244..8fd73ed68b24e 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -446,87 +446,87 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v2, v3, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x5000
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v6, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    s_movk_i32 s0, 0x7f
 ; GFX8-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX8-NEXT:    ; Child Loop BB1_2 Depth 2
-; GFX8-NEXT:    v_mov_b32_e32 v4, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX8-NEXT:    s_mov_b32 s1, 0
 ; GFX8-NEXT:  .LBB1_2: ; %for.body
 ; GFX8-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0xffffb000, v3
-; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, -1, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0xffffb000, v5
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, -1, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0xffffb800, v5
+; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, -1, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 0xffffc000, v5
 ; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
-; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0xffffb800, v3
-; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, -1, v4, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
-; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 0xffffc000, v3
-; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, -1, v4, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, -1, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 0xffffc800, v5
+; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, -1, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0xffffd000, v5
 ; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[11:12]
-; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 0xffffc800, v3
-; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, -1, v4, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[13:14]
-; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0xffffd000, v3
-; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, -1, v4, vcc
-; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0xffffd800, v3
-; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, -1, v4, vcc
-; GFX8-NEXT:    v_add_u32_e32 v19, vcc, 0xffffe000, v3
-; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, -1, v4, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, -1, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0xffffd800, v5
+; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, -1, v6, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[15:16]
 ; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[17:18]
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, 0xffffe000, v5
+; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, -1, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, 0xffffe800, v5
+; GFX8-NEXT:    flat_load_dwordx2 v[19:20], v[19:20]
+; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, -1, v6, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[21:22], v[21:22]
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, 0xfffff000, v5
+; GFX8-NEXT:    v_addc_u32_e32 v24, vcc, -1, v6, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[23:24], v[23:24]
+; GFX8-NEXT:    v_add_u32_e32 v25, vcc, 0xfffff800, v5
+; GFX8-NEXT:    v_addc_u32_e32 v26, vcc, -1, v6, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[25:26], v[25:26]
+; GFX8-NEXT:    flat_load_dwordx2 v[27:28], v[5:6]
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x10000, v5
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX8-NEXT:    s_addk_i32 s1, 0x2000
 ; GFX8-NEXT:    s_cmp_gt_u32 s1, 0x3fffff
-; GFX8-NEXT:    s_waitcnt vmcnt(5)
-; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v7, v5
-; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, v8, v6, vcc
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0xffffe800, v3
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, -1, v4, vcc
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0xfffff000, v3
-; GFX8-NEXT:    flat_load_dwordx2 v[19:20], v[19:20]
-; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
-; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, -1, v4, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(6)
-; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v9, v21
-; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, v10, v22, vcc
-; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0xfffff800, v3
-; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
-; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, -1, v4, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
-; GFX8-NEXT:    s_waitcnt vmcnt(7)
-; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v11, v21
-; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, v12, v22, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[3:4]
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x10000, v3
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(10)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v7, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(9)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v9, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(8)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v11, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v12, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(7)
-; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v21
-; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, v14, v22, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v13, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v14, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(6)
-; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v15, v13
-; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, v16, v14, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v15, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v16, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(5)
-; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v17, v13
-; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, v18, v14, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v17, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v18, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(4)
-; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v19, v13
-; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, v20, v14, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v19, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v20, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(3)
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v13
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v6, v14, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v21, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v22, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v8, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v23, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v24, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v9, v5
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v10, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v25, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v26, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v11, v5
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v12, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v27, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v28, v4, vcc
 ; GFX8-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX8-NEXT:  ; %bb.3: ; %while.cond.loopexit
 ; GFX8-NEXT:    ; in Loop: Header=BB1_1 Depth=1
@@ -540,7 +540,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s35
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s34, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[5:6]
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: clmem_read:
@@ -574,92 +574,91 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX900-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v3, vcc
 ; GFX900-NEXT:    s_movk_i32 s0, 0x5000
 ; GFX900-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
-; GFX900-NEXT:    v_mov_b32_e32 v5, 0
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX900-NEXT:    s_movk_i32 s4, 0x7f
-; GFX900-NEXT:    v_mov_b32_e32 v6, 0
-; GFX900-NEXT:    s_movk_i32 s2, 0xd000
-; GFX900-NEXT:    s_movk_i32 s3, 0xe000
-; GFX900-NEXT:    s_movk_i32 s5, 0xf000
+; GFX900-NEXT:    s_movk_i32 s2, 0x7f
+; GFX900-NEXT:    v_mov_b32_e32 v4, 0
+; GFX900-NEXT:    s_movk_i32 s0, 0xd000
+; GFX900-NEXT:    s_movk_i32 s1, 0xe000
+; GFX900-NEXT:    s_movk_i32 s3, 0xf000
 ; GFX900-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX900-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX900-NEXT:    ; Child Loop BB1_2 Depth 2
-; GFX900-NEXT:    v_mov_b32_e32 v4, v2
-; GFX900-NEXT:    v_mov_b32_e32 v3, v1
-; GFX900-NEXT:    s_mov_b32 s6, 0
+; GFX900-NEXT:    v_mov_b32_e32 v6, v2
+; GFX900-NEXT:    v_mov_b32_e32 v5, v1
+; GFX900-NEXT:    s_mov_b32 s4, 0
 ; GFX900-NEXT:  .LBB1_2: ; %for.body
 ; GFX900-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX900-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, 0xffffb000, v3
-; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, -1, v4, vcc
-; GFX900-NEXT:    global_load_dwordx2 v[9:10], v[3:4], off offset:-4096
-; GFX900-NEXT:    global_load_dwordx2 v[11:12], v[3:4], off offset:-2048
-; GFX900-NEXT:    v_add_co_u32_e32 v13, vcc, 0xffffc000, v3
+; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, 0xffffb000, v5
+; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, -1, v6, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[9:10], v[5:6], off offset:-4096
+; GFX900-NEXT:    global_load_dwordx2 v[11:12], v[5:6], off offset:-2048
+; GFX900-NEXT:    v_add_co_u32_e32 v13, vcc, 0xffffc000, v5
 ; GFX900-NEXT:    global_load_dwordx2 v[7:8], v[7:8], off
-; GFX900-NEXT:    v_addc_co_u32_e32 v14, vcc, -1, v4, vcc
+; GFX900-NEXT:    v_addc_co_u32_e32 v14, vcc, -1, v6, vcc
 ; GFX900-NEXT:    global_load_dwordx2 v[17:18], v[13:14], off offset:-2048
-; GFX900-NEXT:    global_load_dwordx2 v[19:20], v[13:14], off
-; GFX900-NEXT:    v_add_co_u32_e32 v15, vcc, s2, v3
-; GFX900-NEXT:    v_addc_co_u32_e32 v16, vcc, -1, v4, vcc
-; GFX900-NEXT:    v_add_co_u32_e32 v13, vcc, s3, v3
+; GFX900-NEXT:    v_add_co_u32_e32 v15, vcc, s0, v5
+; GFX900-NEXT:    v_addc_co_u32_e32 v16, vcc, -1, v6, vcc
 ; GFX900-NEXT:    global_load_dwordx2 v[15:16], v[15:16], off offset:-2048
-; GFX900-NEXT:    v_addc_co_u32_e32 v14, vcc, -1, v4, vcc
-; GFX900-NEXT:    s_addk_i32 s6, 0x2000
-; GFX900-NEXT:    s_cmp_gt_u32 s6, 0x3fffff
-; GFX900-NEXT:    s_waitcnt vmcnt(3)
-; GFX900-NEXT:    v_add_co_u32_e32 v21, vcc, v7, v5
-; GFX900-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v6, vcc
-; GFX900-NEXT:    global_load_dwordx2 v[7:8], v[13:14], off offset:-4096
-; GFX900-NEXT:    s_waitcnt vmcnt(3)
-; GFX900-NEXT:    v_add_co_u32_e64 v23, s[0:1], v17, v21
-; GFX900-NEXT:    v_addc_co_u32_e64 v24, s[0:1], v18, v6, s[0:1]
-; GFX900-NEXT:    global_load_dwordx2 v[17:18], v[13:14], off offset:-2048
-; GFX900-NEXT:    global_load_dwordx2 v[21:22], v[13:14], off
-; GFX900-NEXT:    v_add_co_u32_e32 v5, vcc, s5, v3
-; GFX900-NEXT:    v_addc_co_u32_e32 v6, vcc, -1, v4, vcc
-; GFX900-NEXT:    global_load_dwordx2 v[5:6], v[5:6], off offset:-2048
-; GFX900-NEXT:    s_waitcnt vmcnt(5)
-; GFX900-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v23
-; GFX900-NEXT:    global_load_dwordx2 v[13:14], v[3:4], off
-; GFX900-NEXT:    v_addc_co_u32_e32 v20, vcc, v20, v24, vcc
-; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, 0x10000, v3
-; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v19, vcc, s1, v5
+; GFX900-NEXT:    global_load_dwordx2 v[13:14], v[13:14], off
+; GFX900-NEXT:    v_addc_co_u32_e32 v20, vcc, -1, v6, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[23:24], v[19:20], off offset:-4096
+; GFX900-NEXT:    global_load_dwordx2 v[25:26], v[19:20], off offset:-2048
+; GFX900-NEXT:    global_load_dwordx2 v[27:28], v[19:20], off
+; GFX900-NEXT:    v_add_co_u32_e32 v21, vcc, s3, v5
+; GFX900-NEXT:    v_addc_co_u32_e32 v22, vcc, -1, v6, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[19:20], v[21:22], off offset:-2048
+; GFX900-NEXT:    global_load_dwordx2 v[29:30], v[5:6], off
+; GFX900-NEXT:    v_add_co_u32_e32 v5, vcc, 0x10000, v5
+; GFX900-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX900-NEXT:    s_addk_i32 s4, 0x2000
+; GFX900-NEXT:    s_cmp_gt_u32 s4, 0x3fffff
+; GFX900-NEXT:    s_waitcnt vmcnt(8)
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, v7, v3
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(7)
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, v17, v3
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, v18, v4, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(5)
-; GFX900-NEXT:    v_add_co_u32_e32 v15, vcc, v15, v19
-; GFX900-NEXT:    v_addc_co_u32_e32 v16, vcc, v16, v20, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, v13, v3
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, v14, v4, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, v15, v3
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, v16, v4, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(4)
-; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v15
-; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v16, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, v23, v3
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, v24, v4, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(3)
-; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, v17, v7
-; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, v18, v8, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, v25, v3
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, v26, v4, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(2)
-; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, v21, v7
-; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, v22, v8, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, v27, v3
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, v28, v4, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(1)
-; GFX900-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX900-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v8, vcc
-; GFX900-NEXT:    v_add_co_u32_e32 v5, vcc, v9, v5
-; GFX900-NEXT:    v_addc_co_u32_e32 v6, vcc, v10, v6, vcc
-; GFX900-NEXT:    v_add_co_u32_e32 v5, vcc, v11, v5
-; GFX900-NEXT:    v_addc_co_u32_e32 v6, vcc, v12, v6, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, v19, v3
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, v20, v4, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, v9, v3
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, v10, v4, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, v11, v3
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, v12, v4, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    v_add_co_u32_e32 v5, vcc, v13, v5
-; GFX900-NEXT:    v_addc_co_u32_e32 v6, vcc, v14, v6, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, v29, v3
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, v30, v4, vcc
 ; GFX900-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX900-NEXT:  ; %bb.3: ; %while.cond.loopexit
 ; GFX900-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; GFX900-NEXT:    s_add_i32 s0, s4, -1
-; GFX900-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX900-NEXT:    s_add_i32 s4, s2, -1
+; GFX900-NEXT:    s_cmp_eq_u32 s2, 0
 ; GFX900-NEXT:    s_cbranch_scc1 .LBB1_5
 ; GFX900-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
-; GFX900-NEXT:    s_mov_b32 s4, s0
+; GFX900-NEXT:    s_mov_b32 s2, s4
 ; GFX900-NEXT:    s_branch .LBB1_1
 ; GFX900-NEXT:  .LBB1_5: ; %while.end
 ; GFX900-NEXT:    v_mov_b32_e32 v1, s35
 ; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v0
 ; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX900-NEXT:    global_store_dwordx2 v[0:1], v[5:6], off
+; GFX900-NEXT:    global_store_dwordx2 v[0:1], v[3:4], off
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: clmem_read:

diff  --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index a67691f42bf27..50a1d48b71304 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -124,7 +124,7 @@ define void @test_func() !dbg !6 {
 ; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     AGPRs: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     ScratchSize [bytes/lane]: 0
-; STDERR-NEXT: remark: foo.cl:8:0:     Occupancy [waves/SIMD]: 10
+; STDERR-NEXT: remark: foo.cl:8:0:     Occupancy [waves/SIMD]: 8
 ; STDERR-NEXT: remark: foo.cl:8:0:     SGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     LDS Size [bytes/block]: 0

diff  --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
index c38665db7b861..cf927b1c0e6fc 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
@@ -42,7 +42,7 @@ body:             |
   ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_HI16 */, def dead %11
   ; CHECK-NEXT:   GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
   ; CHECK-NEXT:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK-NEXT:   [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
   ; CHECK-NEXT:   [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3)
   ; CHECK-NEXT:   INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_HI16 */, def %15, 851978 /* regdef:VGPR_HI16 */, def %16
   ; CHECK-NEXT:   [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec
@@ -50,8 +50,8 @@ body:             |
   ; CHECK-NEXT:   [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec
   ; CHECK-NEXT:   INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_HI16 */, def %21, 851978 /* regdef:VGPR_HI16 */, def %22
   ; CHECK-NEXT:   [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_HI16 */, def dead [[V_MOV_B32_e32_2]], 851978 /* regdef:VGPR_HI16 */, def dead [[V_MOV_B32_e32_3]], 851977 /* reguse:VGPR_HI16 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_2]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_3]](tied-def 5), 851977 /* reguse:VGPR_HI16 */, %15, 851977 /* reguse:VGPR_HI16 */, %16, 851977 /* reguse:VGPR_HI16 */, [[DS_READ_B32_gfx9_1]], 851977 /* reguse:VGPR_HI16 */, [[DS_READ_B32_gfx9_]], 851977 /* reguse:VGPR_HI16 */, [[DS_READ_B32_gfx9_3]], 851977 /* reguse:VGPR_HI16 */, [[DS_READ_B32_gfx9_2]]
-  ; CHECK-NEXT:   %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
   ; CHECK-NEXT:   DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3)
   ; CHECK-NEXT:   DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3)
   ; CHECK-NEXT:   DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store (s64), addrspace 3)

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
index 491e37ad13bb9..8633219ae237c 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
@@ -16,18 +16,18 @@ body: |
     ; CHECK-NEXT: undef %0.sub3:vreg_128 = COPY $vgpr9
     ; CHECK-NEXT: undef %1.sub2:vreg_128 = COPY $vgpr8
     ; CHECK-NEXT: undef %2.sub1:vreg_128 = COPY $vgpr7
-    ; CHECK-NEXT: undef %8.sub1:vreg_64 = COPY $vgpr1
-    ; CHECK-NEXT: %8.sub0:vreg_64 = COPY $vgpr0
     ; CHECK-NEXT: undef %3.sub0:vreg_128 = COPY $vgpr6
     ; CHECK-NEXT: undef %4.sub3:vreg_128 = COPY $vgpr5
     ; CHECK-NEXT: undef %5.sub2:vreg_128 = COPY $vgpr4
+    ; CHECK-NEXT: undef %8.sub1:vreg_64 = COPY $vgpr1
+    ; CHECK-NEXT: %8.sub0:vreg_64 = COPY $vgpr0
     ; CHECK-NEXT: undef %6.sub1:vreg_128 = COPY $vgpr3
     ; CHECK-NEXT: undef %7.sub0:vreg_128 = COPY $vgpr2
     ; CHECK-NEXT: undef %9.sub0:sgpr_128 = V_READFIRSTLANE_B32 %7.sub0, implicit $exec
     ; CHECK-NEXT: %9.sub1:sgpr_128 = V_READFIRSTLANE_B32 %6.sub1, implicit $exec
+    ; CHECK-NEXT: S_BARRIER
     ; CHECK-NEXT: %9.sub2:sgpr_128 = V_READFIRSTLANE_B32 %5.sub2, implicit $exec
     ; CHECK-NEXT: %9.sub3:sgpr_128 = V_READFIRSTLANE_B32 %4.sub3, implicit $exec
-    ; CHECK-NEXT: S_BARRIER
     ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %9, 0, 0, 0, 0, implicit $exec
     ; CHECK-NEXT: undef %12.sub0:sgpr_128 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec
     ; CHECK-NEXT: %12.sub1:sgpr_128 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
index 5a081a75b6b94..d71a3f9276040 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
@@ -3,8 +3,7 @@
 
 ; Provide a long sequence of 32 vec4 load/store pairs that ought to be fully
 ; overlapped for latency hiding. Doing so requires using (at least) 128 VGPRs,
-; which currently looks to the scheduler like an occupancy reduction, even
-; though it's not. TODO: Fix!
+; which (incorrectly) used to look to the scheduler like an occupancy reduction.
 
 ; 6 kB of LDS, allows 10 workgroups
 @lds = internal addrspace(3) global [384 x <4 x i32>] undef
@@ -20,7 +19,7 @@ define internal void @copy(ptr addrspace(1) %src, i32 %ofs) alwaysinline {
 define amdgpu_cs void @test(ptr addrspace(1) %src) "amdgpu-flat-work-group-size"="32,32" {
 ; CHECK-LABEL: test:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_clause 0xa
+; CHECK-NEXT:    s_clause 0x1f
 ; CHECK-NEXT:    global_load_b128 v[2:5], v[0:1], off
 ; CHECK-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:16
 ; CHECK-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:32
@@ -32,81 +31,92 @@ define amdgpu_cs void @test(ptr addrspace(1) %src) "amdgpu-flat-work-group-size"
 ; CHECK-NEXT:    global_load_b128 v[34:37], v[0:1], off offset:128
 ; CHECK-NEXT:    global_load_b128 v[38:41], v[0:1], off offset:144
 ; CHECK-NEXT:    global_load_b128 v[42:45], v[0:1], off offset:160
-; CHECK-NEXT:    v_mov_b32_e32 v86, 0
-; CHECK-NEXT:    s_clause 0x8
 ; CHECK-NEXT:    global_load_b128 v[46:49], v[0:1], off offset:176
-; CHECK-NEXT:    global_load_b128 v[50:53], v[0:1], off offset:240
-; CHECK-NEXT:    global_load_b128 v[54:57], v[0:1], off offset:224
-; CHECK-NEXT:    global_load_b128 v[58:61], v[0:1], off offset:208
-; CHECK-NEXT:    global_load_b128 v[62:65], v[0:1], off offset:192
-; CHECK-NEXT:    global_load_b128 v[66:69], v[0:1], off offset:304
-; CHECK-NEXT:    global_load_b128 v[70:73], v[0:1], off offset:288
-; CHECK-NEXT:    global_load_b128 v[74:77], v[0:1], off offset:272
-; CHECK-NEXT:    global_load_b128 v[78:81], v[0:1], off offset:256
+; CHECK-NEXT:    global_load_b128 v[50:53], v[0:1], off offset:192
+; CHECK-NEXT:    global_load_b128 v[54:57], v[0:1], off offset:208
+; CHECK-NEXT:    global_load_b128 v[58:61], v[0:1], off offset:224
+; CHECK-NEXT:    global_load_b128 v[62:65], v[0:1], off offset:240
+; CHECK-NEXT:    global_load_b128 v[66:69], v[0:1], off offset:256
+; CHECK-NEXT:    global_load_b128 v[70:73], v[0:1], off offset:272
+; CHECK-NEXT:    global_load_b128 v[74:77], v[0:1], off offset:288
+; CHECK-NEXT:    global_load_b128 v[78:81], v[0:1], off offset:304
+; CHECK-NEXT:    global_load_b128 v[82:85], v[0:1], off offset:320
+; CHECK-NEXT:    global_load_b128 v[86:89], v[0:1], off offset:336
+; CHECK-NEXT:    global_load_b128 v[90:93], v[0:1], off offset:352
+; CHECK-NEXT:    global_load_b128 v[94:97], v[0:1], off offset:368
+; CHECK-NEXT:    global_load_b128 v[98:101], v[0:1], off offset:384
+; CHECK-NEXT:    global_load_b128 v[102:105], v[0:1], off offset:400
+; CHECK-NEXT:    global_load_b128 v[106:109], v[0:1], off offset:416
+; CHECK-NEXT:    global_load_b128 v[110:113], v[0:1], off offset:432
+; CHECK-NEXT:    global_load_b128 v[114:117], v[0:1], off offset:448
+; CHECK-NEXT:    global_load_b128 v[118:121], v[0:1], off offset:464
+; CHECK-NEXT:    global_load_b128 v[122:125], v[0:1], off offset:480
+; CHECK-NEXT:    global_load_b128 v[126:129], v[0:1], off offset:496
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(31)
+; CHECK-NEXT:    ds_store_b128 v0, v[2:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(30)
+; CHECK-NEXT:    ds_store_b128 v0, v[6:9] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(29)
+; CHECK-NEXT:    ds_store_b128 v0, v[10:13] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(28)
+; CHECK-NEXT:    ds_store_b128 v0, v[14:17] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(27)
+; CHECK-NEXT:    ds_store_b128 v0, v[18:21] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    ds_store_b128 v0, v[22:25] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(25)
+; CHECK-NEXT:    ds_store_b128 v0, v[26:29] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(24)
+; CHECK-NEXT:    ds_store_b128 v0, v[30:33] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(23)
+; CHECK-NEXT:    ds_store_b128 v0, v[34:37] offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(22)
+; CHECK-NEXT:    ds_store_b128 v0, v[38:41] offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(21)
+; CHECK-NEXT:    ds_store_b128 v0, v[42:45] offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
+; CHECK-NEXT:    ds_store_b128 v0, v[46:49] offset:176
 ; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    ds_store_b128 v86, v[2:5]
+; CHECK-NEXT:    ds_store_b128 v0, v[50:53] offset:192
 ; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    ds_store_b128 v86, v[6:9] offset:16
+; CHECK-NEXT:    ds_store_b128 v0, v[54:57] offset:208
 ; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    ds_store_b128 v86, v[10:13] offset:32
+; CHECK-NEXT:    ds_store_b128 v0, v[58:61] offset:224
 ; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    ds_store_b128 v86, v[14:17] offset:48
+; CHECK-NEXT:    ds_store_b128 v0, v[62:65] offset:240
 ; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    ds_store_b128 v86, v[18:21] offset:64
+; CHECK-NEXT:    ds_store_b128 v0, v[66:69] offset:256
 ; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    ds_store_b128 v86, v[22:25] offset:80
+; CHECK-NEXT:    ds_store_b128 v0, v[70:73] offset:272
 ; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    ds_store_b128 v86, v[26:29] offset:96
+; CHECK-NEXT:    ds_store_b128 v0, v[74:77] offset:288
 ; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    ds_store_b128 v86, v[30:33] offset:112
+; CHECK-NEXT:    ds_store_b128 v0, v[78:81] offset:304
 ; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    ds_store_b128 v86, v[34:37] offset:128
+; CHECK-NEXT:    ds_store_b128 v0, v[82:85] offset:320
 ; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    ds_store_b128 v86, v[38:41] offset:144
+; CHECK-NEXT:    ds_store_b128 v0, v[86:89] offset:336
 ; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    ds_store_b128 v86, v[42:45] offset:160
-; CHECK-NEXT:    s_clause 0xb
-; CHECK-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:368
-; CHECK-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:352
-; CHECK-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:336
-; CHECK-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:320
-; CHECK-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:432
-; CHECK-NEXT:    global_load_b128 v[22:25], v[0:1], off offset:416
-; CHECK-NEXT:    global_load_b128 v[26:29], v[0:1], off offset:400
-; CHECK-NEXT:    global_load_b128 v[30:33], v[0:1], off offset:384
-; CHECK-NEXT:    global_load_b128 v[34:37], v[0:1], off offset:464
-; CHECK-NEXT:    global_load_b128 v[38:41], v[0:1], off offset:448
-; CHECK-NEXT:    global_load_b128 v[42:45], v[0:1], off offset:480
-; CHECK-NEXT:    global_load_b128 v[82:85], v[0:1], off offset:496
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    ds_store_b128 v86, v[46:49] offset:176
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    ds_store_b128 v86, v[62:65] offset:192
-; CHECK-NEXT:    ds_store_b128 v86, v[58:61] offset:208
-; CHECK-NEXT:    ds_store_b128 v86, v[54:57] offset:224
-; CHECK-NEXT:    ds_store_b128 v86, v[50:53] offset:240
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    ds_store_b128 v86, v[78:81] offset:256
-; CHECK-NEXT:    ds_store_b128 v86, v[74:77] offset:272
-; CHECK-NEXT:    ds_store_b128 v86, v[70:73] offset:288
-; CHECK-NEXT:    ds_store_b128 v86, v[66:69] offset:304
+; CHECK-NEXT:    ds_store_b128 v0, v[90:93] offset:352
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    ds_store_b128 v86, v[14:17] offset:320
-; CHECK-NEXT:    ds_store_b128 v86, v[10:13] offset:336
-; CHECK-NEXT:    ds_store_b128 v86, v[6:9] offset:352
-; CHECK-NEXT:    ds_store_b128 v86, v[2:5] offset:368
+; CHECK-NEXT:    ds_store_b128 v0, v[94:97] offset:368
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
+; CHECK-NEXT:    ds_store_b128 v0, v[98:101] offset:384
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    ds_store_b128 v0, v[102:105] offset:400
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    ds_store_b128 v0, v[106:109] offset:416
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    ds_store_b128 v86, v[30:33] offset:384
-; CHECK-NEXT:    ds_store_b128 v86, v[26:29] offset:400
-; CHECK-NEXT:    ds_store_b128 v86, v[22:25] offset:416
-; CHECK-NEXT:    ds_store_b128 v86, v[18:21] offset:432
+; CHECK-NEXT:    ds_store_b128 v0, v[110:113] offset:432
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    ds_store_b128 v0, v[114:117] offset:448
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    ds_store_b128 v86, v[38:41] offset:448
-; CHECK-NEXT:    ds_store_b128 v86, v[34:37] offset:464
+; CHECK-NEXT:    ds_store_b128 v0, v[118:121] offset:464
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    ds_store_b128 v86, v[42:45] offset:480
+; CHECK-NEXT:    ds_store_b128 v0, v[122:125] offset:480
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_store_b128 v86, v[82:85] offset:496
+; CHECK-NEXT:    ds_store_b128 v0, v[126:129] offset:496
 ; CHECK-NEXT:    s_endpgm
   call void @copy(ptr addrspace(1) %src, i32 0)
   call void @copy(ptr addrspace(1) %src, i32 1)

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index 38bf5f9fc7ff4..c9a393d7bdc67 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -822,116 +822,116 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
+; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
+; GCN-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
+; GCN-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; GCN-NEXT:    v_xor_b32_e32 v4, v4, v9
+; GCN-NEXT:    v_xor_b32_e32 v5, v5, v11
+; GCN-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
 ; GCN-NEXT:    v_xor_b32_e32 v15, v8, v9
+; GCN-NEXT:    v_xor_b32_e32 v16, v10, v11
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v8
+; GCN-NEXT:    v_xor_b32_e32 v1, v1, v10
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v4
-; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
+; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v5
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
+; GCN-NEXT:    v_xor_b32_e32 v6, v6, v13
+; GCN-NEXT:    v_xor_b32_e32 v17, v12, v13
+; GCN-NEXT:    v_xor_b32_e32 v2, v2, v12
+; GCN-NEXT:    v_cvt_f32_u32_e32 v12, v6
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; GCN-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
-; GCN-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
+; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v4
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v12, v12
 ; GCN-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; GCN-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GCN-NEXT:    v_cvt_u32_f32_e32 v10, v10
+; GCN-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
+; GCN-NEXT:    v_cvt_u32_f32_e32 v12, v12
+; GCN-NEXT:    v_sub_i32_e32 v11, vcc, 0, v5
 ; GCN-NEXT:    v_mul_lo_u32 v9, v9, v8
-; GCN-NEXT:    v_xor_b32_e32 v5, v5, v11
-; GCN-NEXT:    v_xor_b32_e32 v6, v6, v13
-; GCN-NEXT:    v_xor_b32_e32 v16, v10, v11
+; GCN-NEXT:    v_mul_lo_u32 v11, v11, v10
+; GCN-NEXT:    v_sub_i32_e32 v13, vcc, 0, v6
+; GCN-NEXT:    v_mul_lo_u32 v13, v13, v12
 ; GCN-NEXT:    v_mul_hi_u32 v9, v8, v9
-; GCN-NEXT:    v_xor_b32_e32 v1, v1, v10
-; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v5
-; GCN-NEXT:    v_cvt_f32_u32_e32 v11, v6
+; GCN-NEXT:    v_mul_hi_u32 v11, v10, v11
+; GCN-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
+; GCN-NEXT:    v_mul_hi_u32 v13, v12, v13
+; GCN-NEXT:    v_xor_b32_e32 v7, v7, v14
+; GCN-NEXT:    v_cvt_f32_u32_e32 v18, v7
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v11, v11
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, v11, v10
 ; GCN-NEXT:    v_mul_hi_u32 v8, v0, v8
-; GCN-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
-; GCN-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v10
-; GCN-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v11
+; GCN-NEXT:    v_mul_hi_u32 v9, v1, v9
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, v12, v13
+; GCN-NEXT:    v_mul_hi_u32 v10, v2, v10
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v18, v18
 ; GCN-NEXT:    v_mul_lo_u32 v11, v8, v4
-; GCN-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
-; GCN-NEXT:    v_xor_b32_e32 v17, v12, v13
-; GCN-NEXT:    v_xor_b32_e32 v2, v2, v12
-; GCN-NEXT:    v_sub_i32_e32 v12, vcc, 0, v5
+; GCN-NEXT:    v_mul_lo_u32 v13, v9, v5
+; GCN-NEXT:    v_mul_lo_u32 v21, v10, v6
+; GCN-NEXT:    v_mul_f32_e32 v18, 0x4f7ffffe, v18
+; GCN-NEXT:    v_cvt_u32_f32_e32 v18, v18
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v11
-; GCN-NEXT:    v_cvt_u32_f32_e32 v10, v10
-; GCN-NEXT:    v_mul_lo_u32 v12, v12, v9
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v8
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v13
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
+; GCN-NEXT:    v_add_i32_e32 v20, vcc, 1, v9
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[0:1]
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v5
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v21
 ; GCN-NEXT:    v_subrev_i32_e32 v11, vcc, v4, v0
-; GCN-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
+; GCN-NEXT:    v_subrev_i32_e32 v12, vcc, v5, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v20, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v19, vcc, 0, v7
+; GCN-NEXT:    v_add_i32_e32 v22, vcc, 1, v10
+; GCN-NEXT:    v_subrev_i32_e32 v13, vcc, v6, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 0, v6
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, v10
-; GCN-NEXT:    v_xor_b32_e32 v4, v7, v14
-; GCN-NEXT:    v_mul_hi_u32 v7, v9, v12
-; GCN-NEXT:    v_cvt_f32_u32_e32 v12, v4
-; GCN-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v8
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GCN-NEXT:    v_mul_hi_u32 v7, v1, v7
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, v2, v0
-; GCN-NEXT:    v_mul_lo_u32 v10, v7, v5
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v12, v12
-; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v4
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v10
-; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v5
-; GCN-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
-; GCN-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, 1, v7
-; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[2:3]
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, 1, v0
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v10, v0, v10, s[4:5]
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v5, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[2:3]
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v6, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[4:5]
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, 1, v7
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v7, v1, vcc
-; GCN-NEXT:    v_xor_b32_e32 v1, v8, v15
-; GCN-NEXT:    v_xor_b32_e32 v5, v0, v16
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v15, v1
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v16, v5
-; GCN-NEXT:    v_mul_lo_u32 v5, v9, v12
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s[2:3]
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v9
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v19, v18
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v8, v11, vcc
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
 ; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; GCN-NEXT:    v_mul_hi_u32 v4, v18, v4
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v9, v12, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, v12, v5
 ; GCN-NEXT:    v_xor_b32_e32 v3, v3, v8
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v10
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
-; GCN-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v18
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
+; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, v22, s[4:5]
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v15
+; GCN-NEXT:    v_xor_b32_e32 v1, v1, v16
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s[4:5]
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v15, v0
+; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v16, v1
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v10
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v10, v7, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v10, v5, vcc
+; GCN-NEXT:    v_mul_lo_u32 v5, v4, v7
 ; GCN-NEXT:    v_xor_b32_e32 v2, v2, v17
-; GCN-NEXT:    v_mul_lo_u32 v6, v5, v4
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v17, v2
-; GCN-NEXT:    v_xor_b32_e32 v7, v8, v14
-; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v5
-; GCN-NEXT:    v_subrev_i32_e32 v8, vcc, v4, v3
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT:    v_xor_b32_e32 v6, v8, v14
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
+; GCN-NEXT:    v_subrev_i32_e32 v8, vcc, v7, v3
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v5
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
-; GCN-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v7, v3
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GCN-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v6, v3
 ; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -953,116 +953,116 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
+; TONGA-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
+; TONGA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
 ; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v9, v4
+; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v11, v5
+; TONGA-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
 ; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v8, v0
+; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v10, v1
 ; TONGA-NEXT:    v_xor_b32_e32 v4, v4, v9
+; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v11
+; TONGA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
 ; TONGA-NEXT:    v_xor_b32_e32 v15, v8, v9
+; TONGA-NEXT:    v_xor_b32_e32 v16, v10, v11
+; TONGA-NEXT:    v_add_u32_e32 v6, vcc, v13, v6
 ; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v8
+; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v10
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v4
-; TONGA-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v10, v5
+; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v12, v2
+; TONGA-NEXT:    v_xor_b32_e32 v6, v6, v13
+; TONGA-NEXT:    v_xor_b32_e32 v17, v12, v13
+; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v12
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v12, v6
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; TONGA-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
-; TONGA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v11, v5
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v10, v10
+; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v4
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v12, v12
 ; TONGA-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; TONGA-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; TONGA-NEXT:    v_add_u32_e32 v6, vcc, v13, v6
-; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v10, v1
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v10, v10
+; TONGA-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v12, v12
+; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, 0, v5
 ; TONGA-NEXT:    v_mul_lo_u32 v9, v9, v8
-; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v11
-; TONGA-NEXT:    v_xor_b32_e32 v6, v6, v13
-; TONGA-NEXT:    v_xor_b32_e32 v16, v10, v11
+; TONGA-NEXT:    v_mul_lo_u32 v11, v11, v10
+; TONGA-NEXT:    v_sub_u32_e32 v13, vcc, 0, v6
+; TONGA-NEXT:    v_mul_lo_u32 v13, v13, v12
 ; TONGA-NEXT:    v_mul_hi_u32 v9, v8, v9
-; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v10
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v10, v5
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v11, v6
+; TONGA-NEXT:    v_mul_hi_u32 v11, v10, v11
+; TONGA-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
+; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v14, v7
+; TONGA-NEXT:    v_mul_hi_u32 v13, v12, v13
+; TONGA-NEXT:    v_xor_b32_e32 v7, v7, v14
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v18, v7
 ; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v10, v10
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v11, v11
+; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v11, v10
 ; TONGA-NEXT:    v_mul_hi_u32 v8, v0, v8
-; TONGA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
-; TONGA-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v10
-; TONGA-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v11
+; TONGA-NEXT:    v_mul_hi_u32 v9, v1, v9
+; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v12, v13
+; TONGA-NEXT:    v_mul_hi_u32 v10, v2, v10
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v18, v18
 ; TONGA-NEXT:    v_mul_lo_u32 v11, v8, v4
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v12, v2
-; TONGA-NEXT:    v_xor_b32_e32 v17, v12, v13
-; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v12
-; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, 0, v5
+; TONGA-NEXT:    v_mul_lo_u32 v13, v9, v5
+; TONGA-NEXT:    v_mul_lo_u32 v21, v10, v6
+; TONGA-NEXT:    v_mul_f32_e32 v18, 0x4f7ffffe, v18
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v18, v18
 ; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v11
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v10, v10
-; TONGA-NEXT:    v_mul_lo_u32 v12, v12, v9
-; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v8
+; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v13
+; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
+; TONGA-NEXT:    v_add_u32_e32 v20, vcc, 1, v9
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
-; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[0:1]
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v5
+; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, v2, v21
 ; TONGA-NEXT:    v_subrev_u32_e32 v11, vcc, v4, v0
-; TONGA-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
+; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
+; TONGA-NEXT:    v_subrev_u32_e32 v12, vcc, v5, v1
+; TONGA-NEXT:    v_cndmask_b32_e64 v9, v9, v20, s[2:3]
+; TONGA-NEXT:    v_sub_u32_e32 v19, vcc, 0, v7
+; TONGA-NEXT:    v_add_u32_e32 v22, vcc, 1, v10
+; TONGA-NEXT:    v_subrev_u32_e32 v13, vcc, v6, v2
 ; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s[0:1]
-; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v14, v7
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
-; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, 0, v6
-; TONGA-NEXT:    v_mul_lo_u32 v0, v0, v10
-; TONGA-NEXT:    v_xor_b32_e32 v4, v7, v14
-; TONGA-NEXT:    v_mul_hi_u32 v7, v9, v12
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v12, v4
-; TONGA-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v8
-; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
-; TONGA-NEXT:    v_mul_hi_u32 v7, v1, v7
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v10, v0
-; TONGA-NEXT:    v_mul_hi_u32 v0, v2, v0
-; TONGA-NEXT:    v_mul_lo_u32 v10, v7, v5
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v12, v12
-; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v4
-; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v10
-; TONGA-NEXT:    v_mul_lo_u32 v10, v0, v6
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v5
-; TONGA-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, v2, v10
-; TONGA-NEXT:    v_add_u32_e32 v10, vcc, 1, v7
-; TONGA-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[2:3]
-; TONGA-NEXT:    v_add_u32_e32 v10, vcc, 1, v0
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
-; TONGA-NEXT:    v_cndmask_b32_e64 v10, v0, v10, s[4:5]
-; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v5, v1
-; TONGA-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[2:3]
-; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v6, v2
-; TONGA-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[4:5]
-; TONGA-NEXT:    v_add_u32_e32 v1, vcc, 1, v7
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
-; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[0:1]
-; TONGA-NEXT:    v_cndmask_b32_e32 v0, v7, v1, vcc
-; TONGA-NEXT:    v_xor_b32_e32 v1, v8, v15
-; TONGA-NEXT:    v_xor_b32_e32 v5, v0, v16
-; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v15, v1
-; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v16, v5
-; TONGA-NEXT:    v_mul_lo_u32 v5, v9, v12
+; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s[2:3]
+; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v9
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
+; TONGA-NEXT:    v_mul_lo_u32 v4, v19, v18
+; TONGA-NEXT:    v_cndmask_b32_e32 v0, v8, v11, vcc
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; TONGA-NEXT:    v_mul_hi_u32 v4, v18, v4
+; TONGA-NEXT:    v_cndmask_b32_e32 v1, v9, v12, vcc
 ; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v8, v3
-; TONGA-NEXT:    v_mul_hi_u32 v5, v12, v5
 ; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v8
-; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v10
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v5, v12
-; TONGA-NEXT:    v_mul_hi_u32 v5, v3, v5
+; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v4, v18
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
+; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
+; TONGA-NEXT:    v_cndmask_b32_e64 v10, v10, v22, s[4:5]
+; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v15
+; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v16
+; TONGA-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s[4:5]
+; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v15, v0
+; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v16, v1
+; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v10
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; TONGA-NEXT:    v_cndmask_b32_e32 v2, v10, v7, vcc
+; TONGA-NEXT:    v_cndmask_b32_e32 v2, v10, v5, vcc
+; TONGA-NEXT:    v_mul_lo_u32 v5, v4, v7
 ; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v17
-; TONGA-NEXT:    v_mul_lo_u32 v6, v5, v4
 ; TONGA-NEXT:    v_subrev_u32_e32 v2, vcc, v17, v2
-; TONGA-NEXT:    v_xor_b32_e32 v7, v8, v14
-; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
-; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v5
-; TONGA-NEXT:    v_subrev_u32_e32 v8, vcc, v4, v3
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
-; TONGA-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v3, v5
+; TONGA-NEXT:    v_xor_b32_e32 v6, v8, v14
+; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
+; TONGA-NEXT:    v_subrev_u32_e32 v8, vcc, v7, v3
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
+; TONGA-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; TONGA-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v5
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
-; TONGA-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
-; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v7
-; TONGA-NEXT:    v_subrev_u32_e32 v3, vcc, v7, v3
+; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
+; TONGA-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v6
+; TONGA-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
 ; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; TONGA-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 78b80ca1c5ee8..db22f2e12ca7a 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -366,96 +366,96 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v4, v0
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v5, 31, v3
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v4, v1
-; GCN-IR-NEXT:    v_sub_i32_e32 v11, vcc, v0, v4
-; GCN-IR-NEXT:    v_subb_u32_e32 v12, vcc, v1, v4, vcc
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, v5, v2
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, v5, v3
-; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, v1, v5
-; GCN-IR-NEXT:    v_subb_u32_e32 v3, vcc, v0, v5, vcc
-; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[11:12]
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v0, v2
-; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v3
-; GCN-IR-NEXT:    v_min_u32_e32 v0, v0, v7
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v11
-; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 32, v7
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v8, v12
-; GCN-IR-NEXT:    v_min_u32_e32 v13, v7, v8
-; GCN-IR-NEXT:    v_sub_i32_e32 v7, vcc, v0, v13
-; GCN-IR-NEXT:    v_subb_u32_e64 v8, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[7:8]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[4:5], 63, v[7:8]
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[6:7], -1
+; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, v0, v4
+; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, v1, v4, vcc
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v5, v2
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, v5, v3
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
+; GCN-IR-NEXT:    v_add_i32_e64 v2, s[6:7], 32, v2
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
+; GCN-IR-NEXT:    v_min_u32_e32 v12, v2, v3
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v10
+; GCN-IR-NEXT:    v_add_i32_e64 v2, s[6:7], 32, v2
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v11
+; GCN-IR-NEXT:    v_min_u32_e32 v13, v2, v3
+; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[6:7], v12, v13
+; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; GCN-IR-NEXT:    v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7]
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[6:7], 63, v[2:3]
+; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[2:3]
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, v5
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v10, v12, 0, s[6:7]
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], s[4:5]
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v9, v11, 0, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, v5
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v9, v11, 0, s[4:5]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v8, v10, 0, s[4:5]
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v7
-; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v8, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[4:5], 63, v7
+; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v2
+; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v3, vcc
+; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v2
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[11:12], v7
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[10:11], v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_add_i32_e32 v18, vcc, -1, v2
-; GCN-IR-NEXT:    v_addc_u32_e32 v19, vcc, -1, v3, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v0, v0
-; GCN-IR-NEXT:    v_lshr_b64 v[14:15], v[11:12], v14
-; GCN-IR-NEXT:    v_not_b32_e32 v9, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v11, vcc, v0, v13
-; GCN-IR-NEXT:    v_mov_b32_e32 v16, 0
-; GCN-IR-NEXT:    v_addc_u32_e32 v12, vcc, 0, v9, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, -1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, -1, v1, vcc
+; GCN-IR-NEXT:    v_not_b32_e32 v9, v12
+; GCN-IR-NEXT:    v_lshr_b64 v[14:15], v[10:11], v14
+; GCN-IR-NEXT:    v_not_b32_e32 v8, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, v9, v13
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v8, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v17, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[14:15], v[14:15], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 31, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v0, v14, v0
-; GCN-IR-NEXT:    v_sub_i32_e32 v9, vcc, v18, v0
-; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[7:8], 1
-; GCN-IR-NEXT:    v_subb_u32_e32 v9, vcc, v19, v15, vcc
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v13, 31, v9
-; GCN-IR-NEXT:    v_add_i32_e32 v11, vcc, 1, v11
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v16, v7
-; GCN-IR-NEXT:    v_and_b32_e32 v9, 1, v13
-; GCN-IR-NEXT:    v_and_b32_e32 v16, v13, v3
-; GCN-IR-NEXT:    v_and_b32_e32 v13, v13, v2
-; GCN-IR-NEXT:    v_addc_u32_e32 v12, vcc, 0, v12, vcc
-; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[11:12]
-; GCN-IR-NEXT:    v_sub_i32_e64 v14, s[4:5], v0, v13
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v17, v8
-; GCN-IR-NEXT:    v_subb_u32_e64 v15, s[4:5], v15, v16, s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v17, v10
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v8, 31, v3
+; GCN-IR-NEXT:    v_or_b32_e32 v14, v14, v8
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, v16, v14
+; GCN-IR-NEXT:    v_subb_u32_e32 v8, vcc, v17, v15, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v12, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v10
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v13, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v8, 1, v12
+; GCN-IR-NEXT:    v_and_b32_e32 v13, v12, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v12, v12, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GCN-IR-NEXT:    v_sub_i32_e64 v14, s[4:5], v14, v12
+; GCN-IR-NEXT:    v_subb_u32_e64 v15, s[4:5], v15, v13, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v9
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v16, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v8
 ; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB1_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[7:8], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v9, v9, v2
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v9, v9, v1
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v0
 ; GCN-IR-NEXT:  .LBB1_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v5, v4
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GCN-IR-NEXT:    v_xor_b32_e32 v3, v9, v0
-; GCN-IR-NEXT:    v_xor_b32_e32 v2, v10, v1
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, v7, v6
+; GCN-IR-NEXT:    v_xor_b32_e32 v3, v8, v0
+; GCN-IR-NEXT:    v_xor_b32_e32 v2, v9, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v3, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index da1faae414ce9..dd5a59cb1e36e 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -288,18 +288,18 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_lshr_b64 v[16:17], v[0:1], v16
 ; GCN-NEXT:    v_lshl_b64 v[18:19], v[2:3], v8
 ; GCN-NEXT:    v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
-; GCN-NEXT:    v_subrev_i32_e64 v9, s[6:7], 64, v8
+; GCN-NEXT:    v_subrev_i32_e64 v16, s[6:7], 64, v8
 ; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
+; GCN-NEXT:    v_lshl_b64 v[16:17], v[0:1], v16
+; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
 ; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
-; GCN-NEXT:    v_lshl_b64 v[16:17], v[0:1], v9
 ; GCN-NEXT:    s_and_b64 vcc, s[4:5], vcc
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v16, v18, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v9, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc
 ; GCN-NEXT:    v_sub_i32_e64 v9, s[6:7], 64, v12
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v16, v2, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v11, v17, v19, vcc
 ; GCN-NEXT:    v_lshr_b64 v[9:10], v[4:5], v9
 ; GCN-NEXT:    v_lshl_b64 v[16:17], v[6:7], v12
@@ -337,18 +337,18 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_lshl_b64 v[16:17], v[2:3], v16
 ; GCN-NEXT:    v_lshr_b64 v[18:19], v[0:1], v8
 ; GCN-NEXT:    v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
-; GCN-NEXT:    v_subrev_i32_e64 v9, s[6:7], 64, v8
+; GCN-NEXT:    v_subrev_i32_e64 v16, s[6:7], 64, v8
 ; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
+; GCN-NEXT:    v_lshr_b64 v[16:17], v[2:3], v16
+; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
 ; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
-; GCN-NEXT:    v_lshr_b64 v[16:17], v[2:3], v9
 ; GCN-NEXT:    s_and_b64 vcc, s[4:5], vcc
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v16, v18, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v9, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc
 ; GCN-NEXT:    v_sub_i32_e64 v9, s[6:7], 64, v12
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v11, v17, v19, vcc
 ; GCN-NEXT:    v_lshl_b64 v[9:10], v[6:7], v9
 ; GCN-NEXT:    v_lshr_b64 v[16:17], v[4:5], v12
@@ -386,18 +386,18 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_lshl_b64 v[16:17], v[2:3], v16
 ; GCN-NEXT:    v_lshr_b64 v[18:19], v[0:1], v8
 ; GCN-NEXT:    v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
-; GCN-NEXT:    v_subrev_i32_e64 v9, s[6:7], 64, v8
+; GCN-NEXT:    v_subrev_i32_e64 v16, s[6:7], 64, v8
 ; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
+; GCN-NEXT:    v_ashr_i64 v[16:17], v[2:3], v16
+; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
 ; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
-; GCN-NEXT:    v_ashr_i64 v[16:17], v[2:3], v9
 ; GCN-NEXT:    s_and_b64 vcc, s[4:5], vcc
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v16, v18, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v9, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc
 ; GCN-NEXT:    v_sub_i32_e64 v9, s[6:7], 64, v12
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v11, v17, v19, vcc
 ; GCN-NEXT:    v_lshl_b64 v[9:10], v[6:7], v9
 ; GCN-NEXT:    v_lshr_b64 v[16:17], v[4:5], v12

diff  --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 7e6da2c321f79..8f99ab780ca93 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -918,20 +918,20 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s6
 ; SI-NEXT:    s_mov_b32 s9, s7
-; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
-; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
-; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshl_b64 v[9:10], v[9:10], v13
-; SI-NEXT:    v_lshl_b64 v[7:8], v[7:8], v11
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
+; SI-NEXT:    v_lshl_b64 v[6:7], v[6:7], v13
+; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], v11
+; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: shl_v4i64:

diff  --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index 44881e8345e58..cac917902922e 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -614,20 +614,20 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s6
 ; SI-NEXT:    s_mov_b32 s9, s7
-; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
-; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
-; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
-; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ashr_i64 v[9:10], v[9:10], v13
-; SI-NEXT:    v_ashr_i64 v[7:8], v[7:8], v11
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
+; SI-NEXT:    v_ashr_i64 v[6:7], v[6:7], v13
+; SI-NEXT:    v_ashr_i64 v[4:5], v[4:5], v11
+; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v8
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: ashr_v4i64:
@@ -640,20 +640,20 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s8, s6
 ; VI-NEXT:    s_mov_b32 s9, s7
-; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
-; VI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
-; VI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
+; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; VI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
 ; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
-; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_ashrrev_i64 v[2:3], v10, v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ashrrev_i64 v[9:10], v13, v[9:10]
-; VI-NEXT:    v_ashrrev_i64 v[7:8], v11, v[7:8]
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; VI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
+; VI-NEXT:    v_ashrrev_i64 v[6:7], v13, v[6:7]
+; VI-NEXT:    v_ashrrev_i64 v[4:5], v11, v[4:5]
+; VI-NEXT:    v_ashrrev_i64 v[0:1], v8, v[0:1]
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: ashr_v4i64:

diff  --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
index a759896ac4249..5d21043a42b08 100644
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -266,20 +266,20 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s6
 ; SI-NEXT:    s_mov_b32 s9, s7
-; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
-; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
-; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], v6
-; SI-NEXT:    v_lshr_b64 v[0:1], v[0:1], v4
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshr_b64 v[9:10], v[9:10], v13
-; SI-NEXT:    v_lshr_b64 v[7:8], v[7:8], v11
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
+; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], v13
+; SI-NEXT:    v_lshr_b64 v[4:5], v[4:5], v11
+; SI-NEXT:    v_lshr_b64 v[0:1], v[0:1], v8
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lshr_v4i64:

diff  --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 4eece54e2e2d3..13deecbc78857 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -755,14 +755,14 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v4, v20
+; GFX6-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v4, v20
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
-; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v16
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v4
+; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v17
 ; GFX6-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
-; GFX6-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v17, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v5, v21
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v5
@@ -874,14 +874,14 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v4, v20
+; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v4, v20
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
-; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v16
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v4
+; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v17
 ; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
-; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v17, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v5, v21
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v5

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index b38f77e07fae6..92fc6efa45eaa 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -862,43 +862,43 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_mul_lo_u32 v14, v10, v0
 ; GCN-NEXT:    v_mul_lo_u32 v16, v11, v1
 ; GCN-NEXT:    v_mul_lo_u32 v18, v12, v2
-; GCN-NEXT:    v_mul_lo_u32 v19, v13, v3
+; GCN-NEXT:    v_mul_lo_u32 v20, v13, v3
 ; GCN-NEXT:    v_sub_u32_e32 v4, vcc, v4, v14
 ; GCN-NEXT:    v_sub_u32_e32 v5, vcc, v5, v16
 ; GCN-NEXT:    v_sub_u32_e32 v6, vcc, v6, v18
-; GCN-NEXT:    v_sub_u32_e32 v7, vcc, v7, v19
+; GCN-NEXT:    v_sub_u32_e32 v7, vcc, v7, v20
 ; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v10
 ; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v11
-; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v12
-; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v13
+; GCN-NEXT:    v_add_u32_e32 v19, vcc, 1, v12
+; GCN-NEXT:    v_add_u32_e32 v21, vcc, 1, v13
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
-; GCN-NEXT:    v_subrev_u32_e32 v18, vcc, v0, v4
+; GCN-NEXT:    v_subrev_u32_e32 v14, vcc, v0, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, v15, s[0:1]
 ; GCN-NEXT:    v_subrev_u32_e32 v15, vcc, v1, v5
 ; GCN-NEXT:    v_cndmask_b32_e64 v11, v11, v17, s[2:3]
-; GCN-NEXT:    v_subrev_u32_e32 v17, vcc, v2, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[4:5]
-; GCN-NEXT:    v_subrev_u32_e32 v14, vcc, v3, v7
-; GCN-NEXT:    v_cndmask_b32_e64 v13, v13, v16, s[6:7]
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v18, s[0:1]
-; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v10
+; GCN-NEXT:    v_subrev_u32_e32 v16, vcc, v2, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v12, v12, v19, s[4:5]
+; GCN-NEXT:    v_subrev_u32_e32 v17, vcc, v3, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v13, v13, v21, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v14, s[0:1]
+; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v10
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v15, s[2:3]
 ; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v11
-; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v17, s[4:5]
-; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v12
-; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v14, s[6:7]
-; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v13
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v16, s[4:5]
+; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v12
+; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v17, s[6:7]
+; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v13
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v10, v16, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v10, v14, vcc
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v11, v15, vcc
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v12, v17, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v12, v16, vcc
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v13, v14, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v13, v17, vcc
 ; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index 4c0310f0153fa..64fc82df8c99c 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -304,6 +304,8 @@ define hidden void @blam() {
 ; GCN-NEXT:    v_writelane_b32 v40, s55, 23
 ; GCN-NEXT:    v_writelane_b32 v40, s56, 24
 ; GCN-NEXT:    v_writelane_b32 v40, s57, 25
+; GCN-NEXT:    v_writelane_b32 v40, s58, 26
+; GCN-NEXT:    v_writelane_b32 v40, s59, 27
 ; GCN-NEXT:    v_mov_b32_e32 v41, v31
 ; GCN-NEXT:    s_mov_b32 s46, s15
 ; GCN-NEXT:    s_mov_b32 s47, s14
@@ -316,23 +318,26 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_mov_b64 s[50:51], 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    v_and_b32_e32 v2, 0x3ff, v41
 ; GCN-NEXT:    flat_load_dword v44, v[0:1]
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v41
 ; GCN-NEXT:    v_mov_b32_e32 v43, 0
-; GCN-NEXT:    v_lshlrev_b32_e32 v42, 2, v0
+; GCN-NEXT:    s_getpc_b64 s[52:53]
+; GCN-NEXT:    s_add_u32 s52, s52, spam@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s53, s53, spam@rel32@hi+12
+; GCN-NEXT:    v_lshlrev_b32_e32 v42, 2, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_eq_f32_e64 s[52:53], 0, v44
+; GCN-NEXT:    v_cmp_eq_f32_e64 s[54:55], 0, v44
 ; GCN-NEXT:    v_cmp_neq_f32_e64 s[42:43], 0, v44
 ; GCN-NEXT:    v_mov_b32_e32 v45, 0x7fc00000
 ; GCN-NEXT:    s_branch .LBB1_2
-; GCN-NEXT: LBB1_1: ; %Flow7
+; GCN-NEXT:  .LBB1_1: ; %Flow7
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
 ; GCN-NEXT:    s_or_b64 s[50:51], s[4:5], s[50:51]
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[50:51]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_18
-; GCN-NEXT: .LBB1_2: ; %bb2
+; GCN-NEXT:  .LBB1_2: ; %bb2
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    flat_load_dword v0, v[42:43]
 ; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], 0
@@ -341,18 +346,15 @@ define hidden void @blam() {
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, 2, v0
 ; GCN-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GCN-NEXT:    s_xor_b64 s[54:55], exec, s[8:9]
+; GCN-NEXT:    s_xor_b64 s[56:57], exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_12
-; GCN-NEXT: ; %bb.3: ; %bb6
+; GCN-NEXT:  ; %bb.3: ; %bb6
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[44:45], 3, v0
-; GCN-NEXT:    s_and_saveexec_b64 s[56:57], s[44:45]
+; GCN-NEXT:    s_and_saveexec_b64 s[58:59], s[44:45]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_11
-; GCN-NEXT: %bb.4: ; %bb11
+; GCN-NEXT:  ; %bb.4: ; %bb11
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    s_getpc_b64 s[16:17]
-; GCN-NEXT:    s_add_u32 s16, s16, spam@rel32@lo+4
-; GCN-NEXT:    s_addc_u32 s17, s17, spam@rel32@hi+12
 ; GCN-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[36:37]
@@ -362,63 +364,63 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_mov_b32 s14, s47
 ; GCN-NEXT:    s_mov_b32 s15, s46
 ; GCN-NEXT:    v_mov_b32_e32 v31, v41
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[52:53]
 ; GCN-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB1_10
-; GCN-NEXT: ; %bb.5: ; %bb14
+; GCN-NEXT:  ; %bb.5: ; %bb14
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    s_mov_b64 s[8:9], s[52:53]
+; GCN-NEXT:    s_mov_b64 s[8:9], s[54:55]
 ; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[42:43]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_7
-; GCN-NEXT: ; %bb.6: ; %bb16
+; GCN-NEXT:  ; %bb.6: ; %bb16
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], 0
-; GCN-NEXT:    s_or_b64 s[8:9], s[52:53], exec
-; GCN-NEXT: .LBB1_7: ; %Flow3
+; GCN-NEXT:    s_or_b64 s[8:9], s[54:55], exec
+; GCN-NEXT:  .LBB1_7: ; %Flow3
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-NEXT:    s_and_saveexec_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_xor_b64 s[8:9], exec, s[10:11]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_9
-; GCN-NEXT: ; %bb.8: ; %bb17
+; GCN-NEXT:  ; %bb.8: ; %bb17
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
 ; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], 0
-; GCN-NEXT: .LBB1_9: ; %Flow4
+; GCN-NEXT:  .LBB1_9: ; %Flow4
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-NEXT:    s_and_b64 s[6:7], s[6:7], exec
-; GCN-NEXT: .LBB1_10: ; %Flow2
+; GCN-NEXT:  .LBB1_10: ; %Flow2
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_andn2_b64 s[4:5], s[44:45], exec
 ; GCN-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GCN-NEXT:    s_or_b64 s[44:45], s[4:5], s[8:9]
 ; GCN-NEXT:    s_and_b64 s[6:7], s[6:7], exec
-; GCN-NEXT: .LBB1_11: ; %Flow1
+; GCN-NEXT:  .LBB1_11: ; %Flow1
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    s_or_b64 exec, exec, s[56:57]
+; GCN-NEXT:    s_or_b64 exec, exec, s[58:59]
 ; GCN-NEXT:    s_orn2_b64 s[4:5], s[44:45], exec
 ; GCN-NEXT:    s_and_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT: .LBB1_12: ; %Flow
+; GCN-NEXT:  .LBB1_12: ; %Flow
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    s_andn2_saveexec_b64 s[8:9], s[54:55]
+; GCN-NEXT:    s_andn2_saveexec_b64 s[8:9], s[56:57]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_16
-; GCN-NEXT: ; %bb.13: ; %bb8
+; GCN-NEXT:  ; %bb.13: ; %bb8
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GCN-NEXT:    s_and_saveexec_b64 s[12:13], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB1_15
-; GCN-NEXT: ; %bb.14: ; %bb10
+; GCN-NEXT:  ; %bb.14: ; %bb10
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], 0
 ; GCN-NEXT:    s_or_b64 s[10:11], s[6:7], exec
-; GCN-NEXT: .LBB1_15: ; %Flow6
+; GCN-NEXT:  .LBB1_15: ; %Flow6
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[12:13]
 ; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
@@ -427,18 +429,20 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_and_b64 s[10:11], s[10:11], exec
 ; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], s[12:13]
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
-; GCN-NEXT: .LBB1_16: ; %Flow5
+; GCN-NEXT:  .LBB1_16: ; %Flow5
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_1
-; GCN-NEXT: ; %bb.17: ; %bb18
+; GCN-NEXT:  ; %bb.17: ; %bb18
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], 0
 ; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
 ; GCN-NEXT:    s_branch .LBB1_1
-; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock
+; GCN-NEXT:  .LBB1_18: ; %DummyReturnBlock
 ; GCN-NEXT:    s_or_b64 exec, exec, s[50:51]
+; GCN-NEXT:    v_readlane_b32 s59, v40, 27
+; GCN-NEXT:    v_readlane_b32 s58, v40, 26
 ; GCN-NEXT:    v_readlane_b32 s57, v40, 25
 ; GCN-NEXT:    v_readlane_b32 s56, v40, 24
 ; GCN-NEXT:    v_readlane_b32 s55, v40, 23

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 172744e060cbf..6d4c60eb221df 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -44,7 +44,7 @@
 # FULL-NEXT: fp64-fp16-input-denormals: true
 # FULL-NEXT: fp64-fp16-output-denormals: true
 # FULL-NEXT:  highBitsOf32BitAddress: 0
-# FULL-NEXT:  occupancy: 10
+# FULL-NEXT:  occupancy: 8
 # FULL-NEXT:  vgprForAGPRCopy: ''
 # FULL-NEXT: body:
 
@@ -74,7 +74,7 @@
 # SIMPLE-NEXT: workItemIDX: { reg: '$vgpr0' }
 # SIMPLE-NEXT: workItemIDY:     { reg: '$vgpr31', mask: 1047552 }
 # SIMPLE-NEXT: workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
-# SIMPLE-NEXT: occupancy: 10
+# SIMPLE-NEXT: occupancy: 8
 # SIMPLE-NEXT: body:
 name: kernel0
 machineFunctionInfo:
@@ -142,7 +142,7 @@ body:             |
 # FULL-NEXT: fp64-fp16-input-denormals: true
 # FULL-NEXT: fp64-fp16-output-denormals: true
 # FULL-NEXT:  highBitsOf32BitAddress: 0
-# FULL-NEXT:  occupancy: 10
+# FULL-NEXT:  occupancy: 8
 # FULL-NEXT: vgprForAGPRCopy: ''
 # FULL-NEXT: body:
 
@@ -161,7 +161,7 @@ body:             |
 # SIMPLE-NEXT: workItemIDX:     { reg: '$vgpr31', mask: 1023 }
 # SIMPLE-NEXT: workItemIDY:     { reg: '$vgpr31', mask: 1047552 }
 # SIMPLE-NEXT: workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
-# SIMPLE-NEXT:  occupancy: 10
+# SIMPLE-NEXT:  occupancy: 8
 # SIMPLE-NEXT: body:
 
 name: no_mfi
@@ -211,7 +211,7 @@ body:             |
 # FULL-NEXT: fp64-fp16-input-denormals: true
 # FULL-NEXT: fp64-fp16-output-denormals: true
 # FULL-NEXT:  highBitsOf32BitAddress: 0
-# FULL-NEXT:  occupancy: 10
+# FULL-NEXT:  occupancy: 8
 # FULL-NEXT: vgprForAGPRCopy: ''
 # FULL-NEXT: body:
 
@@ -230,7 +230,7 @@ body:             |
 # SIMPLE-NEXT: workItemIDX:     { reg: '$vgpr31', mask: 1023 }
 # SIMPLE-NEXT: workItemIDY:     { reg: '$vgpr31', mask: 1047552 }
 # SIMPLE-NEXT: workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
-# SIMPLE-NEXT:  occupancy: 10
+# SIMPLE-NEXT:  occupancy: 8
 # SIMPLE-NEXT: body:
 
 name: empty_mfi
@@ -281,7 +281,7 @@ body:             |
 # FULL-NEXT: fp64-fp16-input-denormals: true
 # FULL-NEXT: fp64-fp16-output-denormals: true
 # FULL-NEXT:  highBitsOf32BitAddress: 0
-# FULL-NEXT:  occupancy: 10
+# FULL-NEXT:  occupancy: 8
 # FULL-NEXT: vgprForAGPRCopy: ''
 # FULL-NEXT: body:
 
@@ -301,7 +301,7 @@ body:             |
 # SIMPLE-NEXT: workItemIDX:     { reg: '$vgpr31', mask: 1023 }
 # SIMPLE-NEXT: workItemIDY:     { reg: '$vgpr31', mask: 1047552 }
 # SIMPLE-NEXT: workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
-# SIMPLE-NEXT: occupancy: 10
+# SIMPLE-NEXT: occupancy: 8
 # SIMPLE-NEXT: body:
 
 name: empty_mfi_entry_func
@@ -430,7 +430,7 @@ body:             |
 
 ---
 # ALL-LABEL: name: occupancy_0
-# ALL: occupancy: 10
+# ALL: occupancy: 8
 name: occupancy_0
 machineFunctionInfo:
   occupancy: 0

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index 26a35113dae4e..a3ed1f25ebd93 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -38,7 +38,7 @@
 ; CHECK-NEXT: fp64-fp16-input-denormals: true
 ; CHECK-NEXT: fp64-fp16-output-denormals: true
 ; CHECK-NEXT: highBitsOf32BitAddress: 0
-; CHECK-NEXT: occupancy: 10
+; CHECK-NEXT: occupancy: 8
 ; CHECK-NEXT: vgprForAGPRCopy: ''
 ; CHECK-NEXT: body:
 define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
@@ -132,7 +132,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
 ; CHECK-NEXT: fp64-fp16-input-denormals: true
 ; CHECK-NEXT: fp64-fp16-output-denormals: true
 ; CHECK-NEXT: highBitsOf32BitAddress: 0
-; CHECK-NEXT: occupancy: 10
+; CHECK-NEXT: occupancy: 8
 ; CHECK-NEXT: vgprForAGPRCopy: ''
 ; CHECK-NEXT: body:
 define void @function() {
@@ -178,7 +178,7 @@ define void @function() {
 ; CHECK-NEXT: fp64-fp16-input-denormals: true
 ; CHECK-NEXT: fp64-fp16-output-denormals: true
 ; CHECK-NEXT: highBitsOf32BitAddress: 0
-; CHECK-NEXT: occupancy: 10
+; CHECK-NEXT: occupancy: 8
 ; CHECK-NEXT: vgprForAGPRCopy: ''
 ; CHECK-NEXT: body:
 define void @function_nsz() #0 {


        


More information about the llvm-commits mailing list