[llvm] ba0d079 - [AMDGPU] Aggressively schedule to reduce RP in occupancy limited regions

Austin Kerbow via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 27 22:42:49 PDT 2022


Author: Austin Kerbow
Date: 2022-07-27T22:34:37-07:00
New Revision: ba0d079c7aa52bc0ae860d16dd4a33b0dc5cfff7

URL: https://github.com/llvm/llvm-project/commit/ba0d079c7aa52bc0ae860d16dd4a33b0dc5cfff7
DIFF: https://github.com/llvm/llvm-project/commit/ba0d079c7aa52bc0ae860d16dd4a33b0dc5cfff7.diff

LOG: [AMDGPU] Aggressively schedule to reduce RP in occupancy limited regions

By not clustering loads, and by adjusting heuristics to more aggressively
reduce register pressure, we may be able to increase occupancy for the
function if it was dropped during a first scheduling pass.

Similarly, try to reduce spilling if register usage exceeds lower bound
occupancy.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D130329

Added: 
    llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir

Modified: 
    llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
    llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
    llvm/lib/Target/AMDGPU/GCNSubtarget.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
    llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
    llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir
    llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
    llvm/test/CodeGen/AMDGPU/load-global-i16.ll
    llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
    llvm/test/CodeGen/AMDGPU/pr51516.mir
    llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 859deae86f35e..7668da84fd55c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -31,10 +31,17 @@
 
 using namespace llvm;
 
+cl::opt<bool>
+    DisableUnclusterHighRP("amdgpu-disable-unclustred-high-rp-reschedule",
+                           cl::Hidden,
+                           cl::desc("Disable unclustred high register pressure "
+                                    "reduction scheduling stage."),
+                           cl::init(false));
+
 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
     const MachineSchedContext *C)
     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
-      HasClusteredNodes(false), HasExcessPressure(false) {}
+      HasHighPressure(false) {}
 
 void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
   GenericScheduler::initialize(DAG);
@@ -43,10 +50,6 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
 
   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
 
-  // FIXME: This is also necessary, because some passes that run after
-  // scheduling and before regalloc increase register pressure.
-  const unsigned ErrorMargin = 3;
-
   SGPRExcessLimit =
       Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
   VGPRExcessLimit =
@@ -121,13 +124,13 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
   // marked as RegExcess in tryCandidate() when they are compared with
   // instructions that increase the register pressure.
   if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
-    HasExcessPressure = true;
+    HasHighPressure = true;
     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
     Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
   }
 
   if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
-    HasExcessPressure = true;
+    HasHighPressure = true;
     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
     Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
   }
@@ -141,7 +144,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
   int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
 
   if (SGPRDelta >= 0 || VGPRDelta >= 0) {
-    HasExcessPressure = true;
+    HasHighPressure = true;
     if (SGPRDelta > VGPRDelta) {
       Cand.RPDelta.CriticalMax =
         PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
@@ -300,15 +303,6 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
   if (SU->isBottomReady())
     Bot.removeReady(SU);
 
-  if (!HasClusteredNodes && SU->getInstr()->mayLoadOrStore()) {
-    for (SDep &Dep : SU->Preds) {
-      if (Dep.isCluster()) {
-        HasClusteredNodes = true;
-        break;
-      }
-    }
-  }
-
   LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                     << *SU->getInstr());
   return SU;
@@ -426,12 +420,12 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
   LiveIns.resize(Regions.size());
   Pressure.resize(Regions.size());
   RescheduleRegions.resize(Regions.size());
-  RegionsWithClusters.resize(Regions.size());
   RegionsWithHighRP.resize(Regions.size());
+  RegionsWithExcessRP.resize(Regions.size());
   RegionsWithMinOcc.resize(Regions.size());
   RescheduleRegions.set();
-  RegionsWithClusters.reset();
   RegionsWithHighRP.reset();
+  RegionsWithExcessRP.reset();
   RegionsWithMinOcc.reset();
 
   runSchedStages();
@@ -440,7 +434,8 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
 void GCNScheduleDAGMILive::runSchedStages() {
   LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
   InitialScheduleStage S0(GCNSchedStageID::InitialSchedule, *this);
-  UnclusteredRescheduleStage S1(GCNSchedStageID::UnclusteredReschedule, *this);
+  UnclusteredHighRPStage S1(GCNSchedStageID::UnclusteredHighRPReschedule,
+                            *this);
   ClusteredLowOccStage S2(GCNSchedStageID::ClusteredLowOccupancyReschedule,
                           *this);
   PreRARematStage S3(GCNSchedStageID::PreRARematerialize, *this);
@@ -477,8 +472,8 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
   case GCNSchedStageID::InitialSchedule:
     OS << "Initial Schedule";
     break;
-  case GCNSchedStageID::UnclusteredReschedule:
-    OS << "Unclustered Reschedule";
+  case GCNSchedStageID::UnclusteredHighRPReschedule:
+    OS << "Unclustered High Register Pressure Reschedule";
     break;
   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
     OS << "Clustered Low Occupancy Reschedule";
@@ -503,16 +498,30 @@ bool GCNSchedStage::initGCNSchedStage() {
   return true;
 }
 
-bool UnclusteredRescheduleStage::initGCNSchedStage() {
+bool UnclusteredHighRPStage::initGCNSchedStage() {
+  if (DisableUnclusterHighRP)
+    return false;
+
   if (!GCNSchedStage::initGCNSchedStage())
     return false;
 
-  if (DAG.RescheduleRegions.none())
+  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
     return false;
 
   SavedMutations.swap(DAG.Mutations);
+  InitialOccupancy = DAG.MinOccupancy;
+  // Aggressively try to reduce register pressure in the unclustered high RP
+  // stage. Temporarily increase occupancy target in the region.
+  S.ErrorMargin = S.HighRPErrorMargin;
+  if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
+    MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
+
+  LLVM_DEBUG(
+      dbgs()
+      << "Retrying function scheduling without clustering. "
+         "Aggressivly try to reduce register pressure to achieve occupancy "
+      << DAG.MinOccupancy << ".\n");
 
-  LLVM_DEBUG(dbgs() << "Retrying function scheduling without clustering.\n");
   return true;
 }
 
@@ -565,8 +574,18 @@ void GCNSchedStage::finalizeGCNSchedStage() {
   LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
 }
 
-void UnclusteredRescheduleStage::finalizeGCNSchedStage() {
+void UnclusteredHighRPStage::finalizeGCNSchedStage() {
   SavedMutations.swap(DAG.Mutations);
+  S.ErrorMargin = S.DefaultErrorMargin;
+  if (DAG.MinOccupancy > InitialOccupancy) {
+    for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
+      DAG.RegionsWithMinOcc[IDX] =
+          DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
+
+    LLVM_DEBUG(dbgs() << StageID
+                      << " stage successfully increased occupancy to "
+                      << DAG.MinOccupancy << '\n');
+  }
 
   GCNSchedStage::finalizeGCNSchedStage();
 }
@@ -606,29 +625,29 @@ bool GCNSchedStage::initGCNRegion() {
       llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]).print(dbgs());
       dbgs() << "Region register pressure: "; PressureBefore.print(dbgs()));
 
-  // Set HasClusteredNodes to true for late stages where we have already
-  // collected it. That way pickNode() will not scan SDep's when not needed.
-  S.HasClusteredNodes = StageID > GCNSchedStageID::InitialSchedule;
-  S.HasExcessPressure = false;
+  S.HasHighPressure = false;
 
   return true;
 }
 
-bool UnclusteredRescheduleStage::initGCNRegion() {
-  if (!DAG.RescheduleRegions[RegionIdx])
+bool UnclusteredHighRPStage::initGCNRegion() {
+  // Only reschedule regions with the minimum occupancy or regions that may have
+  // spilling (excess register pressure).
+  if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
+       DAG.MinOccupancy <= InitialOccupancy) &&
+      !DAG.RegionsWithExcessRP[RegionIdx])
     return false;
 
   return GCNSchedStage::initGCNRegion();
 }
 
 bool ClusteredLowOccStage::initGCNRegion() {
-  // We may need to reschedule this region if it doesn't have clusters so it
-  // wasn't rescheduled in the last stage, or if we found it was testing
-  // critical register pressure limits in the unclustered reschedule stage. The
-  // later is because we may not have been able to raise the min occupancy in
-  // the previous stage so the region may be overly constrained even if it was
-  // already rescheduled.
-  if (!DAG.RegionsWithClusters[RegionIdx] && !DAG.RegionsWithHighRP[RegionIdx])
+  // We may need to reschedule this region if it wasn't rescheduled in the last
+  // stage, or if we found it was testing critical register pressure limits in
+  // the unclustered reschedule stage. The latter is because we may not have been
+  // able to raise the min occupancy in the previous stage so the region may be
+  // overly constrained even if it was already rescheduled.
+  if (!DAG.RegionsWithHighRP[RegionIdx])
     return false;
 
   return GCNSchedStage::initGCNRegion();
@@ -656,7 +675,7 @@ void GCNSchedStage::setupNewBlock() {
 void GCNSchedStage::finalizeGCNRegion() {
   DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd);
   DAG.RescheduleRegions[RegionIdx] = false;
-  if (S.HasExcessPressure)
+  if (S.HasHighPressure)
     DAG.RegionsWithHighRP[RegionIdx] = true;
 
   // Revert scheduling if we have dropped occupancy or there is some other
@@ -667,16 +686,6 @@ void GCNSchedStage::finalizeGCNRegion() {
   RegionIdx++;
 }
 
-void InitialScheduleStage::finalizeGCNRegion() {
-  // Record which regions have clustered nodes for the next unclustered
-  // reschedule stage.
-  assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule);
-  if (S.HasClusteredNodes)
-    DAG.RegionsWithClusters[RegionIdx] = true;
-
-  GCNSchedStage::finalizeGCNRegion();
-}
-
 void GCNSchedStage::checkScheduling() {
   // Check the results of scheduling.
   PressureAfter = DAG.getRealRegPressure(RegionIdx);
@@ -731,6 +740,7 @@ void GCNSchedStage::checkScheduling() {
       PressureAfter.getSGPRNum() > MaxSGPRs) {
     DAG.RescheduleRegions[RegionIdx] = true;
     DAG.RegionsWithHighRP[RegionIdx] = true;
+    DAG.RegionsWithExcessRP[RegionIdx] = true;
   }
 
   // Revert if this region's schedule would cause a drop in occupancy or
@@ -758,21 +768,15 @@ bool InitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
   if (mayCauseSpilling(WavesAfter))
     return true;
 
-  assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule);
-  // Don't reschedule the region in the next stage if it doesn't have clusters.
-  if (!DAG.RegionsWithClusters[RegionIdx])
-    DAG.RescheduleRegions[RegionIdx] = false;
-
   return false;
 }
 
-bool UnclusteredRescheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
-  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
-    return true;
-
-  // If RP is not reduced in the unclustred reschedule stage, revert to the old
-  // schedule.
-  if (!PressureAfter.less(ST, PressureBefore)) {
+bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
+  // If RP is not reduced in the unclustered reschedule stage, revert to the
+  // old schedule.
+  if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
+       mayCauseSpilling(WavesAfter)) ||
+      GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
     LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
     return true;
   }
@@ -803,7 +807,7 @@ bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
   if (WavesAfter <= MFI.getMinWavesPerEU() &&
       !PressureAfter.less(ST, PressureBefore) &&
-      DAG.RescheduleRegions[RegionIdx]) {
+      DAG.RegionsWithExcessRP[RegionIdx]) {
     LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
     return true;
   }
@@ -816,8 +820,7 @@ void GCNSchedStage::revertScheduling() {
       PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
   DAG.RescheduleRegions[RegionIdx] =
-      DAG.RegionsWithClusters[RegionIdx] ||
-      (nextStage(StageID)) != GCNSchedStageID::UnclusteredReschedule;
+      (nextStage(StageID)) != GCNSchedStageID::UnclusteredHighRPReschedule;
   DAG.RegionEnd = DAG.RegionBegin;
   int SkippedDebugInstr = 0;
   for (MachineInstr *MI : Unsched) {

diff  --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 7aadf89e0bf7f..ffa68bae67a1c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -52,13 +52,18 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
   MachineFunction *MF;
 
 public:
-  // schedule() have seen a clustered memory operation. Set it to false
-  // before a region scheduling to know if the region had such clusters.
-  bool HasClusteredNodes;
+  // schedule() have seen register pressure over the critical limits and had to
+  // track register pressure for actual scheduling heuristics.
+  bool HasHighPressure;
 
-  // schedule() have seen an excess register pressure and had to track
-  // register pressure for actual scheduling heuristics.
-  bool HasExcessPressure;
+  // An error margin is necessary because of poor performance of the generic RP
+  // tracker and can be adjusted up for tuning heuristics to try and more
+  // aggressively reduce register pressure.
+  const unsigned DefaultErrorMargin = 3;
+
+  const unsigned HighRPErrorMargin = 10;
+
+  unsigned ErrorMargin = DefaultErrorMargin;
 
   unsigned SGPRCriticalLimit;
 
@@ -77,7 +82,7 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
 
 enum class GCNSchedStageID : unsigned {
   InitialSchedule = 0,
-  UnclusteredReschedule = 1,
+  UnclusteredHighRPReschedule = 1,
   ClusteredLowOccupancyReschedule = 2,
   PreRARematerialize = 3,
   LastStage = PreRARematerialize
@@ -104,7 +109,7 @@ inline bool operator>(GCNSchedStageID &LHS, GCNSchedStageID &RHS) {
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
   friend class InitialScheduleStage;
-  friend class UnclusteredRescheduleStage;
+  friend class UnclusteredHighRPStage;
   friend class ClusteredLowOccStage;
   friend class PreRARematStage;
 
@@ -126,12 +131,13 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // or we generally desire to reschedule it.
   BitVector RescheduleRegions;
 
-  // Record regions which use clustered loads/stores.
-  BitVector RegionsWithClusters;
-
   // Record regions with high register pressure.
   BitVector RegionsWithHighRP;
 
+  // Record regions with excess register pressure over the physical register
+  // limit. Register pressure in these regions usually will result in spilling.
+  BitVector RegionsWithExcessRP;
+
   // Regions that has the same occupancy as the latest MinOccupancy
   BitVector RegionsWithMinOcc;
 
@@ -220,7 +226,7 @@ class GCNSchedStage {
   void setupNewBlock();
 
   // Finalize state after scheudling a region.
-  virtual void finalizeGCNRegion();
+  void finalizeGCNRegion();
 
   // Check result of scheduling.
   void checkScheduling();
@@ -241,18 +247,19 @@ class GCNSchedStage {
 
 class InitialScheduleStage : public GCNSchedStage {
 public:
-  void finalizeGCNRegion() override;
-
   bool shouldRevertScheduling(unsigned WavesAfter) override;
 
   InitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
       : GCNSchedStage(StageID, DAG) {}
 };
 
-class UnclusteredRescheduleStage : public GCNSchedStage {
+class UnclusteredHighRPStage : public GCNSchedStage {
 private:
   std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
 
+  // Save the initial occupancy before starting this stage.
+  unsigned InitialOccupancy;
+
 public:
   bool initGCNSchedStage() override;
 
@@ -262,7 +269,7 @@ class UnclusteredRescheduleStage : public GCNSchedStage {
 
   bool shouldRevertScheduling(unsigned WavesAfter) override;
 
-  UnclusteredRescheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+  UnclusteredHighRPStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
       : GCNSchedStage(StageID, DAG) {}
 };
 

diff  --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index d71f80c5f4583..5a8be4d12de1e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -30,10 +30,9 @@ class GCNTargetMachine;
 
 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
                            public AMDGPUSubtarget {
-
+public:
   using AMDGPUSubtarget::getMaxWavesPerEU;
 
-public:
   // Following 2 enums are documented at:
   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
   enum class TrapHandlerAbi {

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
index 093111b755c01..88ab58d2b2126 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
@@ -501,24 +501,24 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; GFX9-CONTRACT-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:28
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:12
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:16
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:28
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:32
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
+; GFX9-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -527,24 +527,24 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; GFX9-DENORM-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-DENORM-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
-; GFX9-DENORM-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
-; GFX9-DENORM-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
-; GFX9-DENORM-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
-; GFX9-DENORM-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:28
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
-; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-DENORM-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
+; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:12
+; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:16
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
+; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
+; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
+; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:28
+; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:32
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
+; GFX9-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -616,24 +616,24 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; GFX9-CONTRACT-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:28
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:12
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:16
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:28
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:32
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
+; GFX9-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -642,24 +642,24 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; GFX9-DENORM-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-DENORM-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
-; GFX9-DENORM-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
-; GFX9-DENORM-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
-; GFX9-DENORM-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
-; GFX9-DENORM-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:28
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
-; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-DENORM-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
+; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:12
+; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:16
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
+; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
+; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
+; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:28
+; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:32
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
+; GFX9-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
index 1c6c8dee69cf5..b0bb5c516c325 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
@@ -104,7 +104,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 8, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x9
+; GFX11-NEXT:    s_clause 0x8
 ; GFX11-NEXT:    global_load_b128 v[32:35], v64, s[0:1]
 ; GFX11-NEXT:    global_load_b128 v[36:39], v64, s[0:1] offset:16
 ; GFX11-NEXT:    global_load_b128 v[40:43], v64, s[0:1] offset:32
@@ -114,10 +114,10 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
 ; GFX11-NEXT:    global_load_b128 v[56:59], v64, s[0:1] offset:96
 ; GFX11-NEXT:    global_load_b128 v[60:63], v64, s[0:1] offset:112
 ; GFX11-NEXT:    global_load_b128 v[4:7], v64, s[0:1] offset:144
-; GFX11-NEXT:    global_load_b128 v[0:3], v64, s[0:1] offset:128
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v5, 0x3e7
-; GFX11-NEXT:    s_clause 0x5
+; GFX11-NEXT:    s_clause 0x6
+; GFX11-NEXT:    global_load_b128 v[0:3], v64, s[0:1] offset:128
 ; GFX11-NEXT:    global_load_b128 v[8:11], v64, s[0:1] offset:160
 ; GFX11-NEXT:    global_load_b128 v[12:15], v64, s[0:1] offset:176
 ; GFX11-NEXT:    global_load_b128 v[16:19], v64, s[0:1] offset:192
@@ -131,8 +131,10 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-NEXT:    global_store_b128 v64, v[8:11], s[2:3] offset:160
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    s_clause 0x8
 ; GFX11-NEXT:    global_store_b128 v64, v[12:15], s[2:3] offset:176
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    s_clause 0x8
+; GFX11-NEXT:    global_store_b128 v64, v[16:19], s[2:3] offset:192
 ; GFX11-NEXT:    global_store_b128 v64, v[32:35], s[2:3]
 ; GFX11-NEXT:    global_store_b128 v64, v[36:39], s[2:3] offset:16
 ; GFX11-NEXT:    global_store_b128 v64, v[40:43], s[2:3] offset:32
@@ -141,8 +143,6 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
 ; GFX11-NEXT:    global_store_b128 v64, v[52:55], s[2:3] offset:80
 ; GFX11-NEXT:    global_store_b128 v64, v[56:59], s[2:3] offset:96
 ; GFX11-NEXT:    global_store_b128 v64, v[60:63], s[2:3] offset:112
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    global_store_b128 v64, v[16:19], s[2:3] offset:192
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    global_store_b128 v64, v[20:23], s[2:3] offset:208
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 8c5c2e32844da..d5ad0062aff3e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -1645,211 +1645,208 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX7-LABEL: v_mul_i256:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v16, v0
-; GFX7-NEXT:    v_mov_b32_e32 v17, v1
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1]
-; GFX7-NEXT:    v_mul_lo_u32 v27, v3, v12
-; GFX7-NEXT:    v_mul_lo_u32 v26, v5, v10
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1]
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1]
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19]
+; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
+; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
+; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX7-NEXT:    v_mul_lo_u32 v28, v4, v11
+; GFX7-NEXT:    v_mul_lo_u32 v27, v5, v10
+; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
+; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
+; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
+; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
+; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
 ; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
 ; GFX7-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
 ; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX7-NEXT:    v_addc_u32_e32 v22, vcc, 0, v20, vcc
+; GFX7-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
+; GFX7-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
 ; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX7-NEXT:    v_addc_u32_e32 v24, vcc, 0, v22, vcc
-; GFX7-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0
-; GFX7-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1]
-; GFX7-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23]
-; GFX7-NEXT:    v_mov_b32_e32 v1, v18
+; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
+; GFX7-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
+; GFX7-NEXT:    v_addc_u32_e32 v25, vcc, 0, v20, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v20, v18
 ; GFX7-NEXT:    v_mov_b32_e32 v18, v19
-; GFX7-NEXT:    v_mov_b32_e32 v19, v20
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19]
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX7-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23]
-; GFX7-NEXT:    v_addc_u32_e64 v25, s[4:5], 0, v0, s[4:5]
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v23
-; GFX7-NEXT:    v_mul_lo_u32 v23, v4, v11
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1]
-; GFX7-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX7-NEXT:    v_mul_lo_u32 v13, v2, v13
-; GFX7-NEXT:    v_mul_lo_u32 v20, v6, v9
-; GFX7-NEXT:    v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12]
+; GFX7-NEXT:    v_mov_b32_e32 v19, v16
+; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
+; GFX7-NEXT:    v_mul_lo_u32 v16, v6, v9
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX7-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
+; GFX7-NEXT:    v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5]
+; GFX7-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
+; GFX7-NEXT:    v_mov_b32_e32 v19, v22
+; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
+; GFX7-NEXT:    v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
+; GFX7-NEXT:    v_mul_lo_u32 v24, v3, v12
+; GFX7-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
+; GFX7-NEXT:    v_mul_lo_u32 v22, v2, v13
+; GFX7-NEXT:    v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[12:13]
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1]
+; GFX7-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
 ; GFX7-NEXT:    v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13]
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1]
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0
-; GFX7-NEXT:    v_mov_b32_e32 v2, v22
-; GFX7-NEXT:    v_addc_u32_e64 v10, s[12:13], 0, v4, s[12:13]
-; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2]
+; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
+; GFX7-NEXT:    v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
+; GFX7-NEXT:    v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13]
+; GFX7-NEXT:    v_mov_b32_e32 v20, v11
+; GFX7-NEXT:    v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
 ; GFX7-NEXT:    v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
-; GFX7-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12]
-; GFX7-NEXT:    v_addc_u32_e64 v10, s[12:13], 0, v10, s[12:13]
-; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[16:17]
-; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2]
-; GFX7-NEXT:    v_addc_u32_e64 v3, s[12:13], v11, v3, s[12:13]
-; GFX7-NEXT:    v_mul_lo_u32 v11, v16, v15
-; GFX7-NEXT:    v_mul_lo_u32 v9, v17, v14
-; GFX7-NEXT:    v_addc_u32_e64 v4, s[12:13], v25, v4, s[12:13]
-; GFX7-NEXT:    v_addc_u32_e64 v5, s[12:13], v10, v5, s[12:13]
-; GFX7-NEXT:    v_addc_u32_e64 v6, s[12:13], v24, v6, s[12:13]
-; GFX7-NEXT:    v_addc_u32_e64 v10, s[12:13], v21, v11, s[12:13]
-; GFX7-NEXT:    v_addc_u32_e64 v9, s[12:13], v10, v9, s[14:15]
-; GFX7-NEXT:    v_addc_u32_e64 v9, s[10:11], v9, v13, s[10:11]
-; GFX7-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v27, s[8:9]
-; GFX7-NEXT:    v_addc_u32_e64 v9, s[6:7], v9, v23, s[6:7]
-; GFX7-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v26, s[4:5]
-; GFX7-NEXT:    v_addc_u32_e32 v9, vcc, v9, v20, vcc
-; GFX7-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX7-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
+; GFX7-NEXT:    v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13]
+; GFX7-NEXT:    v_mul_lo_u32 v9, v1, v14
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[16:17]
+; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
+; GFX7-NEXT:    v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13]
+; GFX7-NEXT:    v_mul_lo_u32 v0, v0, v15
+; GFX7-NEXT:    v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13]
+; GFX7-NEXT:    v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13]
+; GFX7-NEXT:    v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13]
+; GFX7-NEXT:    v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13]
+; GFX7-NEXT:    v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15]
+; GFX7-NEXT:    v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11]
+; GFX7-NEXT:    v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9]
+; GFX7-NEXT:    v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7]
+; GFX7-NEXT:    v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX7-NEXT:    v_addc_u32_e32 v0, vcc, v0, v16, vcc
+; GFX7-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_i256:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v16, v0
-; GFX8-NEXT:    v_mov_b32_e32 v17, v1
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1]
-; GFX8-NEXT:    v_mul_lo_u32 v27, v3, v12
-; GFX8-NEXT:    v_mul_lo_u32 v26, v5, v10
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19]
+; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX8-NEXT:    v_mul_lo_u32 v28, v4, v11
+; GFX8-NEXT:    v_mul_lo_u32 v27, v5, v10
+; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
+; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
+; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
+; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
+; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
 ; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
 ; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
 ; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, 0, v20, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
+; GFX8-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
 ; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX8-NEXT:    v_addc_u32_e32 v24, vcc, 0, v22, vcc
-; GFX8-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0
-; GFX8-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23]
-; GFX8-NEXT:    v_mov_b32_e32 v1, v18
+; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
+; GFX8-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
+; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, 0, v20, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v20, v18
 ; GFX8-NEXT:    v_mov_b32_e32 v18, v19
-; GFX8-NEXT:    v_mov_b32_e32 v19, v20
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19]
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX8-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23]
-; GFX8-NEXT:    v_addc_u32_e64 v25, s[4:5], 0, v0, s[4:5]
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19]
-; GFX8-NEXT:    v_mov_b32_e32 v0, v23
-; GFX8-NEXT:    v_mul_lo_u32 v23, v4, v11
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX8-NEXT:    v_mul_lo_u32 v13, v2, v13
-; GFX8-NEXT:    v_mul_lo_u32 v20, v6, v9
-; GFX8-NEXT:    v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12]
+; GFX8-NEXT:    v_mov_b32_e32 v19, v16
+; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
+; GFX8-NEXT:    v_mul_lo_u32 v16, v6, v9
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX8-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
+; GFX8-NEXT:    v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5]
+; GFX8-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
+; GFX8-NEXT:    v_mov_b32_e32 v19, v22
+; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
+; GFX8-NEXT:    v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
+; GFX8-NEXT:    v_mul_lo_u32 v24, v3, v12
+; GFX8-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
+; GFX8-NEXT:    v_mul_lo_u32 v22, v2, v13
+; GFX8-NEXT:    v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[12:13]
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1]
+; GFX8-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
 ; GFX8-NEXT:    v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13]
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, v22
-; GFX8-NEXT:    v_addc_u32_e64 v10, s[12:13], 0, v4, s[12:13]
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2]
+; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
+; GFX8-NEXT:    v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
+; GFX8-NEXT:    v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13]
+; GFX8-NEXT:    v_mov_b32_e32 v20, v11
+; GFX8-NEXT:    v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
 ; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
-; GFX8-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12]
-; GFX8-NEXT:    v_addc_u32_e64 v10, s[12:13], 0, v10, s[12:13]
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[16:17]
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2]
-; GFX8-NEXT:    v_addc_u32_e64 v3, s[12:13], v11, v3, s[12:13]
-; GFX8-NEXT:    v_mul_lo_u32 v11, v16, v15
-; GFX8-NEXT:    v_mul_lo_u32 v9, v17, v14
-; GFX8-NEXT:    v_addc_u32_e64 v4, s[12:13], v25, v4, s[12:13]
-; GFX8-NEXT:    v_addc_u32_e64 v5, s[12:13], v10, v5, s[12:13]
-; GFX8-NEXT:    v_addc_u32_e64 v6, s[12:13], v24, v6, s[12:13]
-; GFX8-NEXT:    v_addc_u32_e64 v10, s[12:13], v21, v11, s[12:13]
-; GFX8-NEXT:    v_addc_u32_e64 v9, s[12:13], v10, v9, s[14:15]
-; GFX8-NEXT:    v_addc_u32_e64 v9, s[10:11], v9, v13, s[10:11]
-; GFX8-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v27, s[8:9]
-; GFX8-NEXT:    v_addc_u32_e64 v9, s[6:7], v9, v23, s[6:7]
-; GFX8-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v26, s[4:5]
-; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v9, v20, vcc
-; GFX8-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX8-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
+; GFX8-NEXT:    v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13]
+; GFX8-NEXT:    v_mul_lo_u32 v9, v1, v14
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[16:17]
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
+; GFX8-NEXT:    v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13]
+; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v15
+; GFX8-NEXT:    v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13]
+; GFX8-NEXT:    v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13]
+; GFX8-NEXT:    v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13]
+; GFX8-NEXT:    v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13]
+; GFX8-NEXT:    v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15]
+; GFX8-NEXT:    v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11]
+; GFX8-NEXT:    v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9]
+; GFX8-NEXT:    v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7]
+; GFX8-NEXT:    v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX8-NEXT:    v_addc_u32_e32 v0, vcc, v0, v16, vcc
+; GFX8-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_i256:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v16, v0
-; GFX9-NEXT:    v_mov_b32_e32 v17, v1
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v27, v3, v12
-; GFX9-NEXT:    v_mul_lo_u32 v26, v5, v10
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19]
+; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX9-NEXT:    v_mul_lo_u32 v28, v4, v11
+; GFX9-NEXT:    v_mul_lo_u32 v27, v5, v10
+; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
+; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
+; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
 ; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v20, vcc, 0, v20, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX9-NEXT:    v_addc_co_u32_e32 v22, vcc, 0, v20, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v20, vcc, 0, v20, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX9-NEXT:    v_addc_co_u32_e32 v24, vcc, 0, v22, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23]
-; GFX9-NEXT:    v_mov_b32_e32 v1, v18
+; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
+; GFX9-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
+; GFX9-NEXT:    v_addc_co_u32_e32 v25, vcc, 0, v20, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v20, v18
 ; GFX9-NEXT:    v_mov_b32_e32 v18, v19
-; GFX9-NEXT:    v_mov_b32_e32 v19, v20
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX9-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23]
-; GFX9-NEXT:    v_addc_co_u32_e64 v25, s[4:5], 0, v0, s[4:5]
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v23
-; GFX9-NEXT:    v_mul_lo_u32 v23, v4, v11
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX9-NEXT:    v_mul_lo_u32 v13, v2, v13
-; GFX9-NEXT:    v_mul_lo_u32 v20, v6, v9
-; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12]
+; GFX9-NEXT:    v_mov_b32_e32 v19, v16
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
+; GFX9-NEXT:    v_mul_lo_u32 v16, v6, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX9-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
+; GFX9-NEXT:    v_addc_co_u32_e64 v26, s[4:5], 0, v6, s[4:5]
+; GFX9-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
+; GFX9-NEXT:    v_mov_b32_e32 v19, v22
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
+; GFX9-NEXT:    v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
+; GFX9-NEXT:    v_mul_lo_u32 v24, v3, v12
+; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
+; GFX9-NEXT:    v_mul_lo_u32 v22, v2, v13
+; GFX9-NEXT:    v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[12:13]
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
 ; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13]
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, v22
-; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[12:13], 0, v4, s[12:13]
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2]
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
+; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
+; GFX9-NEXT:    v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13]
+; GFX9-NEXT:    v_mov_b32_e32 v20, v11
+; GFX9-NEXT:    v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
 ; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
-; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12]
-; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[12:13], 0, v10, s[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[16:17]
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2]
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[12:13], v11, v3, s[12:13]
-; GFX9-NEXT:    v_mul_lo_u32 v11, v16, v15
-; GFX9-NEXT:    v_mul_lo_u32 v9, v17, v14
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[12:13], v25, v4, s[12:13]
-; GFX9-NEXT:    v_addc_co_u32_e64 v5, s[12:13], v10, v5, s[12:13]
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[12:13], v24, v6, s[12:13]
-; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[12:13], v21, v11, s[12:13]
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[12:13], v10, v9, s[14:15]
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[10:11], v9, v13, s[10:11]
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[8:9], v9, v27, s[8:9]
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[6:7], v9, v23, s[6:7]
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], v9, v26, s[4:5]
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v20, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
+; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[12:13], 0, v2, s[12:13]
+; GFX9-NEXT:    v_mul_lo_u32 v9, v1, v14
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[16:17]
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
+; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[12:13], v12, v3, s[12:13]
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v15
+; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[12:13], v26, v4, s[12:13]
+; GFX9-NEXT:    v_addc_co_u32_e64 v5, s[12:13], v11, v5, s[12:13]
+; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[12:13], v25, v6, s[12:13]
+; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[12:13], v17, v0, s[12:13]
+; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[12:13], v0, v9, s[14:15]
+; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[10:11], v0, v22, s[10:11]
+; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[8:9], v0, v24, s[8:9]
+; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[6:7], v0, v28, s[6:7]
+; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v16, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_i256:

diff  --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 2ec97e4a8e510..aff17ca623cd6 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -529,7 +529,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX908-NEXT:    s_sub_i32 s4, 0, s1
+; GFX908-NEXT:    s_sub_i32 s7, 0, s1
 ; GFX908-NEXT:    s_lshr_b32 s5, s6, 16
 ; GFX908-NEXT:    v_cvt_f32_f16_e32 v25, s6
 ; GFX908-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -539,10 +539,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX908-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v7, s3
-; GFX908-NEXT:    s_lshl_b64 s[6:7], s[8:9], 5
-; GFX908-NEXT:    v_mov_b32_e32 v6, s2
-; GFX908-NEXT:    v_mul_lo_u32 v2, s4, v0
 ; GFX908-NEXT:    s_mov_b32 s4, 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s2
+; GFX908-NEXT:    v_mul_lo_u32 v2, s7, v0
+; GFX908-NEXT:    s_lshl_b64 s[6:7], s[8:9], 5
 ; GFX908-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX908-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX908-NEXT:    v_mul_hi_u32 v0, s0, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir
index fe2e03aec01bb..24e80c9c177c3 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir
+++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir
@@ -13,14 +13,6 @@
 # CHECK-NEXT:   From: DBG_VALUE %17:vgpr_32, 0, 0
 # CHECK-NEXT:     To: S_ENDPGM 0, implicit %69:vgpr_32, implicit %70:vgpr_32
 # CHECK-NEXT:  RegionInstrs: 46
-# CHECK: Unclustered reschedule did not help.
-# CHECK: Attempting to revert scheduling.
-# CHECK: Retrying function scheduling with lowest recorded occupancy 3.
-# CHECK: ********** MI Scheduling **********
-# CHECK: test_same_num_instrs:%bb.2
-# CHECK-NEXT:   From: DBG_VALUE %17:vgpr_32, 0, 0
-# CHECK-NEXT:     To: S_ENDPGM 0, implicit %69:vgpr_32, implicit %70:vgpr_32
-# CHECK-NEXT:  RegionInstrs: 46
 # CHECK: Attempting to revert scheduling.
 
 ---

diff  --git a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
new file mode 100644
index 0000000000000..c1b78d430dca9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
@@ -0,0 +1,144 @@
+# REQUIRES: asserts
+# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @high-RP-reschedule() { ret void }
+...
+
+# CHECK: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4
+
+---
+name: high-RP-reschedule
+tracksRegLiveness: true
+machineFunctionInfo:
+  occupancy: 4
+body: |
+  bb.0:
+    %0:vreg_128 = IMPLICIT_DEF
+    %1:vreg_128 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vreg_128 = IMPLICIT_DEF
+    %4:vreg_128 = IMPLICIT_DEF
+    %5:vreg_128 = IMPLICIT_DEF
+    %6:vreg_128 = IMPLICIT_DEF
+    %7:vreg_128 = IMPLICIT_DEF
+    %8:vreg_128 = IMPLICIT_DEF
+    %9:vreg_128 = IMPLICIT_DEF
+    %10:vreg_128 = IMPLICIT_DEF
+    %11:sreg_64_xexec = IMPLICIT_DEF
+    %12:vreg_64 = IMPLICIT_DEF
+
+  bb.1:
+    %13:vgpr_32 = V_LSHRREV_B16_e32 1, %12.sub0, implicit $exec
+    %14:vgpr_32 = V_AND_B32_e32 127, %13, implicit $exec
+    %15:vgpr_32 = V_MUL_LO_U16_e32 49, %14, implicit $exec
+    %16:vgpr_32 = V_LSHRREV_B16_e32 10, %15, implicit $exec
+    %17:vgpr_32 = V_MUL_LO_U16_e32 42, %16, implicit $exec
+    %18:vgpr_32 = V_SUB_U16_e32 %12.sub0, %17, implicit $exec
+    %19:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+    %20:vgpr_32 = V_MUL_U32_U24_sdwa 0, %18, 0, %19, 0, 6, 0, 0, 6, implicit $exec
+    %21:vgpr_32 = V_LSHLREV_B32_e32 4, %20, implicit $exec
+    %22:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 608, 0, implicit $exec :: (load (s128))
+    %23:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 576, 0, implicit $exec :: (load (s128))
+    %24:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 592, 0, implicit $exec :: (load (s128))
+    %25:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 624, 0, implicit $exec :: (load (s128))
+    %26:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 672, 0, implicit $exec :: (load (s128))
+    %27:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 640, 0, implicit $exec :: (load (s128))
+    %28:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 656, 0, implicit $exec :: (load (s128))
+    %29:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %7.sub2_sub3, 0, %25.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+    %30:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %7.sub2_sub3, 0, %25.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+    %31:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %7.sub0_sub1, 0, %25.sub0_sub1, 1, %29, 0, 0, implicit $mode, implicit $exec
+    %32:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %7.sub0_sub1, 0, %25.sub2_sub3, 0, %30, 0, 0, implicit $mode, implicit $exec
+    %33:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %5.sub2_sub3, 0, %24.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+    %34:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %5.sub2_sub3, 0, %24.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+    %35:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %5.sub0_sub1, 0, %24.sub0_sub1, 1, %33, 0, 0, implicit $mode, implicit $exec
+    %36:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %5.sub0_sub1, 0, %24.sub2_sub3, 0, %34, 0, 0, implicit $mode, implicit $exec
+    %37:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %9.sub2_sub3, 0, %28.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+    %38:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %9.sub2_sub3, 0, %28.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+    %39:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %9.sub0_sub1, 0, %28.sub0_sub1, 1, %37, 0, 0, implicit $mode, implicit $exec
+    %40:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %9.sub0_sub1, 0, %28.sub2_sub3, 0, %38, 0, 0, implicit $mode, implicit $exec
+    %41:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %4.sub2_sub3, 0, %23.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+    %42:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %4.sub2_sub3, 0, %23.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+    %43:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %4.sub0_sub1, 0, %23.sub0_sub1, 1, %41, 0, 0, implicit $mode, implicit $exec
+    %44:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %4.sub0_sub1, 0, %23.sub2_sub3, 0, %42, 0, 0, implicit $mode, implicit $exec
+    %45:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %8.sub2_sub3, 0, %27.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+    %46:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %8.sub2_sub3, 0, %27.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+    %47:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %6.sub2_sub3, 0, %22.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+    %48:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %8.sub0_sub1, 0, %27.sub0_sub1, 1, %45, 0, 0, implicit $mode, implicit $exec
+    %49:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %8.sub0_sub1, 0, %27.sub2_sub3, 0, %46, 0, 0, implicit $mode, implicit $exec
+    %50:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %10.sub2_sub3, 0, %26.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+    %51:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %6.sub2_sub3, 0, %22.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+    %52:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %10.sub2_sub3, 0, %26.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+    %53:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %6.sub0_sub1, 0, %22.sub0_sub1, 1, %47, 0, 0, implicit $mode, implicit $exec
+    %54:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %10.sub0_sub1, 0, %26.sub0_sub1, 1, %50, 0, 0, implicit $mode, implicit $exec
+    %55:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %6.sub0_sub1, 0, %22.sub2_sub3, 0, %51, 0, 0, implicit $mode, implicit $exec
+    %56:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %10.sub0_sub1, 0, %26.sub2_sub3, 0, %52, 0, 0, implicit $mode, implicit $exec
+    %57:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %3.sub2_sub3, 1, %32, 0, 0, implicit $mode, implicit $exec
+    %58:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %35, 1, %39, 0, 0, implicit $mode, implicit $exec
+    %59:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %44, 1, %49, 0, 0, implicit $mode, implicit $exec
+    %60:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %53, 1, %54, 0, 0, implicit $mode, implicit $exec
+    %61:sreg_64 = S_MOV_B64_IMM_PSEUDO 4604544271217802189
+    %62:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %3.sub0_sub1, 1, %31, 0, 0, implicit $mode, implicit $exec
+    undef %63.sub1:sreg_64 = S_MOV_B32 -1075404642
+    %64:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %36, 1, %40, 0, 0, implicit $mode, implicit $exec
+    %65:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %43, 1, %48, 0, 0, implicit $mode, implicit $exec
+    %66:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %55, 1, %56, 0, 0, implicit $mode, implicit $exec
+    %67:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %57, 0, %58, 0, 0, implicit $mode, implicit $exec
+    %68:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %59, 0, %60, 0, 0, implicit $mode, implicit $exec
+    %69:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %3.sub2_sub3, 0, 4611686018427387904, 1, %57, 0, 0, implicit $mode, implicit $exec
+    %70:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %3.sub0_sub1, 0, 4611686018427387904, 1, %62, 0, 0, implicit $mode, implicit $exec
+    %71:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %35, 0, 4611686018427387904, 1, %58, 0, 0, implicit $mode, implicit $exec
+    %72:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %36, 0, 4611686018427387904, 1, %64, 0, 0, implicit $mode, implicit $exec
+    %73:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %43, 0, 4611686018427387904, 1, %65, 0, 0, implicit $mode, implicit $exec
+    %74:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %44, 0, 4611686018427387904, 1, %59, 0, 0, implicit $mode, implicit $exec
+    %75:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %55, 0, 4611686018427387904, 1, %66, 0, 0, implicit $mode, implicit $exec
+    %76:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %53, 0, 4611686018427387904, 1, %60, 0, 0, implicit $mode, implicit $exec
+    %77:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %62, 1, %64, 0, 0, implicit $mode, implicit $exec
+    %78:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %65, 1, %66, 0, 0, implicit $mode, implicit $exec
+    %79:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %70, 1, %71, 0, 0, implicit $mode, implicit $exec
+    %80:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %57, 0, 4611686018427387904, 1, %67, 0, 0, implicit $mode, implicit $exec
+    %81:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %59, 0, 4611686018427387904, 1, %68, 0, 0, implicit $mode, implicit $exec
+    %82:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %69, 1, %72, 0, 0, implicit $mode, implicit $exec
+    %83:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %74, 1, %75, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %73, 1, %76, 0, 0, implicit $mode, implicit $exec
+    %85:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %62, 0, 4611686018427387904, 1, %77, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %65, 0, 4611686018427387904, 1, %78, 0, 0, implicit $mode, implicit $exec
+    %63.sub0:sreg_64 = COPY %61.sub0
+    %87:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %68, 0, %61, 0, %67, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %81, 0, %63, 0, %80, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %70, 0, 4611686018427387904, 1, %79, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %69, 0, 4611686018427387904, 1, %82, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %74, 0, 4611686018427387904, 1, %83, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %73, 0, 4611686018427387904, 1, %84, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %86, 0, %63, 0, %85, 0, 0, implicit $mode, implicit $exec
+    %94:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %78, 0, %61, 0, %77, 0, 0, implicit $mode, implicit $exec
+    undef %95.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %78, 0, %61, 0, %87, 0, 0, implicit $mode, implicit $exec
+    undef %96.sub2_sub3:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %82, 0, %84, 0, 0, implicit $mode, implicit $exec
+    undef %97.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %86, 0, %61, 0, %88, 0, 0, implicit $mode, implicit $exec
+    undef %98.sub2_sub3:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %90, 1, %91, 0, 0, implicit $mode, implicit $exec
+    %98.sub0_sub1:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %89, 1, %92, 0, 0, implicit $mode, implicit $exec
+    %97.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %81, 0, %63, 0, %93, 0, 0, implicit $mode, implicit $exec
+    %96.sub0_sub1:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %79, 1, %83, 0, 0, implicit $mode, implicit $exec
+    %95.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %68, 0, %63, 0, %94, 0, 0, implicit $mode, implicit $exec
+    undef %99.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %67, 0, 4611686018427387904, 1, %95.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+    undef %100.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %82, 0, 4611686018427387904, 1, %96.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+    undef %101.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %80, 0, 4611686018427387904, 1, %97.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+    undef %102.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %90, 0, 4611686018427387904, 1, %98.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+    %102.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %89, 0, 4611686018427387904, 1, %98.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+    %101.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %85, 0, 4611686018427387904, 1, %97.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+    %100.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %79, 0, 4611686018427387904, 1, %96.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+    %99.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %77, 0, 4611686018427387904, 1, %95.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+    %103:vgpr_32 = V_ADD_U32_sdwa 0, %2, 0, %18, 0, 6, 0, 6, 0, implicit $exec
+    %104:vgpr_32 = V_LSHL_ADD_U32_e64 %103, 4, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %104, %102, 0, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %104, %101, 672, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %104, %100, 1344, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %104, %99, 2016, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %104, %98, 2688, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %104, %97, 3360, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %104, %96, 4032, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %104, %95, 4704, 0, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0, implicit %0, implicit %1
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 225e8773791e6..5d358f9e4f6ce 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -2948,18 +2948,19 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s42
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s62
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s43
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s40
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s61
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s60
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s41
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s59
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s40
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s61
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s60
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s41
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s59
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(5)
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s58
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s39
@@ -3094,95 +3095,95 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_and_b32 s50, s50, 0xffff
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xf0
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s6
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xe0
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s6
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xd0
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s6
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xc0
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s6
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xb0
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s6
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xa0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s46
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s47
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s63
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s6
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[23:24], v[8:11]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s6
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x90
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s66
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s49
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s65
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s62
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s45
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s61
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s6
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[25:26], v[12:15]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s6
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x80
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s68
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s67
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s6
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x70
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s42
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s60
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s43
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[19:20], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s59
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s50
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[27:28], v[16:19]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s6
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x60
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s68
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s51
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s67
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s66
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s49
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s65
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s46
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s47
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s63
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s44
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s62
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s45
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s61
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s40
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s58
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s38
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s41
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s57
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s56
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s39
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s40
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s38
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s58
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s41
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s57
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s56
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s39
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s36
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s55
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s55
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s19
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s6
 ; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x50
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s37
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s14
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s12
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s52
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s34
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[20:23]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s33
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s34
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[20:23]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
@@ -3682,18 +3683,19 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s22
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s61
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s62
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s23
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s60
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s59
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s60
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s59
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(5)
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s58
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
@@ -3806,72 +3808,82 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_ashr_i32 s68, s50, 16
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s3
+; GCN-HSA-NEXT:    s_sext_i32_i16 s47, s47
+; GCN-HSA-NEXT:    s_sext_i32_i16 s46, s46
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s46
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s47
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s63
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s3
-; GCN-HSA-NEXT:    s_sext_i32_i16 s49, s49
-; GCN-HSA-NEXT:    s_sext_i32_i16 s48, s48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[23:24], v[8:11]
+; GCN-HSA-NEXT:    s_sext_i32_i16 s45, s45
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s3
+; GCN-HSA-NEXT:    s_sext_i32_i16 s44, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s66
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s49
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s65
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s62
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s45
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s61
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[25:26], v[12:15]
+; GCN-HSA-NEXT:    s_sext_i32_i16 s51, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s3
+; GCN-HSA-NEXT:    s_sext_i32_i16 s50, s50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s43, s43
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s42, s42
-; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s68
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s67
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s42
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s60
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s43
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[19:20], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s59
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
-; GCN-HSA-NEXT:    s_sext_i32_i16 s51, s51
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[27:28], v[16:19]
+; GCN-HSA-NEXT:    s_sext_i32_i16 s49, s49
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    s_sext_i32_i16 s50, s50
+; GCN-HSA-NEXT:    s_sext_i32_i16 s48, s48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s36, s36
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s39, s39
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s38, s38
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s41, s41
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s40, s40
-; GCN-HSA-NEXT:    s_sext_i32_i16 s45, s45
-; GCN-HSA-NEXT:    s_sext_i32_i16 s44, s44
-; GCN-HSA-NEXT:    s_sext_i32_i16 s47, s47
-; GCN-HSA-NEXT:    s_sext_i32_i16 s46, s46
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s68
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s51
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s67
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s66
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s49
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s65
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s29, s29
@@ -3879,43 +3891,33 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s31, s31
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s30, s30
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s37, s37
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s46
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s47
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s63
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s44
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s62
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s45
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s61
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s40
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s58
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s38
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s41
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s57
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s56
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s39
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s40
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s38
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s58
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s41
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s57
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s56
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s39
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s36
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s55
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s55
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s54
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s37
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s30
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s53
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s28
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s52
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s28
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s31
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s34
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[20:23]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s29
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s33
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s34
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s29
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[20:23]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s27, s27
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s26, s26
@@ -6578,16 +6580,16 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, s15
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s20, s13
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s36, s11
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s40, s9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s44, s7
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s46, s5
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s38, s3
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s42, s1
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s40, s11
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s42, s9
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s46, s7
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s44, s5
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s36, s3
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s38, s1
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s14, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s12, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s8, 16
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[48:49], s[20:21], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[50:51], s[18:19], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s52, s6, 16
@@ -6596,8 +6598,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s58, s0, 16
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[0:1], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[4:5], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[30:31], s[6:7], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[4:5], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[34:35], s[6:7], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[60:61], s[8:9], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[62:63], s[10:11], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[64:65], s[12:13], 0x100000
@@ -6623,50 +6625,50 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[46:47], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[44:45], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[42:43], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[40:41], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[40:41], s[42:43], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[40:41], s[44:45], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x100000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s37
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x100000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s17
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s10
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s15
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s8
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s13
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s40
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s41
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[58:59], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[56:57], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[54:55], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[52:53], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[34:35], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[30:31], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[28:29], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x100000
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x100000
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(5)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s37
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s70
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s71
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s40
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s41
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s39
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s68
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s69
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
@@ -6679,14 +6681,14 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s63
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s60
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s61
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s30
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s35
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s25
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, s20
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, s21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s23
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
@@ -6721,21 +6723,21 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_mov_b32 s42, s15
-; GCN-HSA-NEXT:    s_mov_b32 s44, s13
-; GCN-HSA-NEXT:    s_mov_b32 s46, s11
-; GCN-HSA-NEXT:    s_mov_b32 s48, s9
-; GCN-HSA-NEXT:    s_mov_b32 s50, s7
-; GCN-HSA-NEXT:    s_mov_b32 s52, s5
-; GCN-HSA-NEXT:    s_mov_b32 s54, s3
-; GCN-HSA-NEXT:    s_mov_b32 s56, s1
-; GCN-HSA-NEXT:    s_lshr_b32 s58, s14, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s60, s12, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s62, s10, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s64, s8, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s66, s6, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s68, s4, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s70, s2, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s72, s0, 16
+; GCN-HSA-NEXT:    s_mov_b32 s48, s13
+; GCN-HSA-NEXT:    s_mov_b32 s50, s11
+; GCN-HSA-NEXT:    s_mov_b32 s52, s9
+; GCN-HSA-NEXT:    s_mov_b32 s54, s7
+; GCN-HSA-NEXT:    s_mov_b32 s56, s5
+; GCN-HSA-NEXT:    s_mov_b32 s44, s3
+; GCN-HSA-NEXT:    s_mov_b32 s58, s1
+; GCN-HSA-NEXT:    s_lshr_b32 s60, s14, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s62, s12, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s64, s10, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s66, s8, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s68, s6, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s70, s4, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s72, s2, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s74, s0, 16
 ; GCN-HSA-NEXT:    s_bfe_i64 s[18:19], s[0:1], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
 ; GCN-HSA-NEXT:    s_ashr_i64 s[36:37], s[0:1], 48
@@ -6749,7 +6751,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-NEXT:    s_bfe_i64 s[30:31], s[12:13], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[34:35], s[14:15], 0x100000
 ; GCN-HSA-NEXT:    s_ashr_i64 s[40:41], s[4:5], 48
-; GCN-HSA-NEXT:    s_ashr_i64 s[74:75], s[6:7], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[46:47], s[6:7], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[76:77], s[8:9], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[78:79], s[10:11], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[80:81], s[12:13], 48
@@ -6757,94 +6759,95 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-HSA-NEXT:    s_bfe_i64 s[0:1], s[72:73], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[70:71], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[4:5], s[68:69], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[6:7], s[66:67], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[8:9], s[64:65], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[10:11], s[62:63], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[12:13], s[60:61], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[14:15], s[58:59], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[42:43], s[56:57], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[0:1], s[74:75], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[72:73], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[4:5], s[70:71], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[6:7], s[68:69], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[8:9], s[66:67], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[10:11], s[64:65], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[12:13], s[62:63], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[14:15], s[60:61], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[42:43], s[58:59], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[56:57], s[56:57], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[54:55], s[54:55], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[52:53], s[52:53], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x100000
-; GCN-HSA-NEXT:    s_add_u32 s56, s16, 0xf0
-; GCN-HSA-NEXT:    s_addc_u32 s57, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s44
-; GCN-HSA-NEXT:    s_add_u32 s44, s16, 0xd0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s45
-; GCN-HSA-NEXT:    s_addc_u32 s45, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s44
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s45
-; GCN-HSA-NEXT:    s_add_u32 s44, s16, 0xb0
-; GCN-HSA-NEXT:    s_addc_u32 s45, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s44
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s45
-; GCN-HSA-NEXT:    s_add_u32 s44, s16, 0x90
-; GCN-HSA-NEXT:    s_addc_u32 s45, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s44
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s56
-; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s45
-; GCN-HSA-NEXT:    s_add_u32 s44, s16, 0x70
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s57
-; GCN-HSA-NEXT:    s_addc_u32 s45, s17, 0
+; GCN-HSA-NEXT:    s_add_u32 s58, s16, 0xf0
+; GCN-HSA-NEXT:    s_addc_u32 s59, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s48
+; GCN-HSA-NEXT:    s_add_u32 s48, s16, 0xd0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s49
+; GCN-HSA-NEXT:    s_addc_u32 s49, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s49
+; GCN-HSA-NEXT:    s_add_u32 s48, s16, 0xb0
+; GCN-HSA-NEXT:    s_addc_u32 s49, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s58
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s46
+; GCN-HSA-NEXT:    s_add_u32 s46, s16, 0x90
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s59
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s47
+; GCN-HSA-NEXT:    s_addc_u32 s47, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s40
-; GCN-HSA-NEXT:    s_add_u32 s40, s16, 0x50
+; GCN-HSA-NEXT:    s_add_u32 s40, s16, 0x70
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s41
 ; GCN-HSA-NEXT:    s_addc_u32 s41, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s40
+; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s41
+; GCN-HSA-NEXT:    s_add_u32 s40, s16, 0x50
+; GCN-HSA-NEXT:    s_addc_u32 s41, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s80
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s81
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s38
 ; GCN-HSA-NEXT:    s_add_u32 s38, s16, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s39
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s78
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s79
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s49
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s39
 ; GCN-HSA-NEXT:    s_addc_u32 s39, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s38
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s40
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s39
 ; GCN-HSA-NEXT:    s_add_u32 s38, s16, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s46
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s47
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s78
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s79
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s56
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s57
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s41
 ; GCN-HSA-NEXT:    s_addc_u32 s39, s17, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s46
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[20:23]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s14
 ; GCN-HSA-NEXT:    s_add_u32 s14, s16, 0xe0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s49
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s52
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s53
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s76
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s77
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s47
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s15
 ; GCN-HSA-NEXT:    s_addc_u32 s15, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s44
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s54
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s12
 ; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0xc0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s51
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s74
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s75
-; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s45
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s40
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s55
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s13
 ; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s52
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s53
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s54
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s55
-; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s41
-; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s38
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s45
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s38
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s42
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s13
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s43
-; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s39
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s39
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s36
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s37
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s34
@@ -6852,10 +6855,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s30
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s31
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s14
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[20:23]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s12
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[4:7]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[4:7]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s10

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index b8b7f77bf52ff..8eeee8d653afb 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -3484,8 +3484,8 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v12
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v18
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v15
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v14
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
@@ -3503,99 +3503,88 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v16
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v19
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, 0xffff, v18
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v7, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v8, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v9, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v19
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v18
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[18:21], off, s[8:11], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, 0xffff, v17
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v16
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v27
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v26
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v25
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v24
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v27
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v26
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, 0xffff, v25
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v24
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v21
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v20
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v19
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v18
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v28, 0xffff, v21
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, 0xffff, v20
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v32, 0xffff, v19
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v30, 0xffff, v18
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v31
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v30
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v29
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v28
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, 0xffff, v31
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xffff, v30
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v33, 0xffff, v29
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v31, 0xffff, v28
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v25
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v24
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v23
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v22
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v25
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v24
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[38:41], off, s[8:11], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v36, 0xffff, v23
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, 0xffff, v22
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v41
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v40
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v39
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v38
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v37
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v36
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v35
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v43, 0xffff, v41
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v41, 0xffff, v40
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v47, 0xffff, v39
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v45, 0xffff, v38
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v43, 0xffff, v37
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v49, 0xffff, v36
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v47, 0xffff, v35
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v42
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v41
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v40
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v39
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v37, 0xffff, v42
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v35, 0xffff, v41
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[55:58], off, s[8:11], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v53, 0xffff, v40
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v51, 0xffff, v39
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v23
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v22
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v21
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v20
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v51, 0xffff, v23
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v49, 0xffff, v22
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v55, 0xffff, v21
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v53, 0xffff, v20
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[57:60], off, s[8:11], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v58
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v57
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v56
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v25
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(1)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v24
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v60, 0xffff, v58
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v58, 0xffff, v57
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v56
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v55
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, 0xffff, v25
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v24
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v23
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v22
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v42
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v41
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v40
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v39
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, 0xffff, v42
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v41
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v40
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v39
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v60
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v59
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v58
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v57
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, 0xffff, v60
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v59
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v62, 0xffff, v58
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v60, 0xffff, v57
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[53:56], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
@@ -3617,132 +3606,132 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x50
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x60
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x70
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s6, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s8, s2, 48
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; GCN-HSA-NEXT:    s_addc_u32 s9, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 64
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s8
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[32:35], v[32:33]
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s4
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[28:29]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[24:25]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s0
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(6)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v1
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[32:35], v[0:1]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v31, 16, v13
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v29, 16, v12
+; GCN-HSA-NEXT:    v_and_b32_e32 v30, 0xffff, v13
+; GCN-HSA-NEXT:    v_and_b32_e32 v28, 0xffff, v12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xf0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[28:31]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s3
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0xc0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0xd0
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s8, s0, 0xd0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v29, 16, v14
+; GCN-HSA-NEXT:    v_and_b32_e32 v30, 0xffff, v15
+; GCN-HSA-NEXT:    v_and_b32_e32 v28, 0xffff, v14
+; GCN-HSA-NEXT:    s_add_u32 s8, s0, 0xa0
 ; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[24:27]
-; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0xa0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v3
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
-; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[28:31]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s11
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s9
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
+; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v9
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v9
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v8
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s8
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v11
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v11
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v12, 16, v10
+; GCN-HSA-NEXT:    v_and_b32_e32 v13, 0xffff, v11
+; GCN-HSA-NEXT:    v_and_b32_e32 v11, 0xffff, v10
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[11:14]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s7
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s6
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[8:11]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v6
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s5
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[7:10]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v14
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v13
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v12
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v12
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v15
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v14
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s4
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[4:7]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[8:11]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s3
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(11)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v17
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
 ; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v17
 ; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v19
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
 ; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v19
@@ -3759,23 +3748,24 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v33
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v32
 ; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v33
 ; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v32
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v21
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v20
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v21
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v20
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v25
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v24
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v25
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v24
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v22
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v23
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v22
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v27
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v26
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v27
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v26
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
@@ -3787,18 +3777,18 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v29
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v28
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v29
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v28
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v21
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v20
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v21
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v20
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v30
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v31
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v30
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v23
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v22
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
@@ -3833,102 +3823,93 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v12
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v5, 0xffff, v13
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v3, 0xffff, v12
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v13
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v12
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, 0xffff, v13
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v12
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v16
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, 0xffff, v19
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v18
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v14, 0xffff, v17
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, 0xffff, v16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v3, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v18
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v16
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v26, 0xffff, v19
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v30, 0xffff, v17
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, 0xffff, v16
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v4, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v5, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v6, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v27
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v23
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v22
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v21
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v20
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, 0xffff, v23
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v22
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v34, 0xffff, v21
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v32, 0xffff, v20
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:80
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v38
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v44, 16, v37
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v50, 16, v36
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v35
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v45, 0xffff, v38
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v43, 0xffff, v37
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v49, 0xffff, v36
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v47, 0xffff, v35
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v23
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v41, 16, v22
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v47, 16, v21
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v45, 16, v20
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v42, 0xffff, v23
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v40, 0xffff, v22
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v46, 0xffff, v21
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v44, 0xffff, v20
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v42
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v36, 16, v41
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v54, 16, v40
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v52, 16, v39
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v37, 0xffff, v42
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v35, 0xffff, v41
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v53, 0xffff, v40
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v51, 0xffff, v39
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[55:58], off, s[8:11], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v26
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v25
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v21, 16, v24
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, 0xffff, v27
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v26
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v22, 0xffff, v25
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, 0xffff, v24
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v27, 16, v31
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v30
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v29
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v28
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v26, 0xffff, v31
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, 0xffff, v30
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v33, 0xffff, v29
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v31, 0xffff, v28
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v39
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v21, 16, v38
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v37
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v36
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v22, 0xffff, v39
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, 0xffff, v38
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v50, 0xffff, v37
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v48, 0xffff, v36
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[52:55], off, s[8:11], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v62, 16, v42
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v59, 16, v39
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v56
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v55
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, 0xffff, v56
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v55
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v60, 16, v41
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v40
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v39
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v61, 0xffff, v42
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v59, 0xffff, v41
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, 0xffff, v40
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v39
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v42, 16, v58
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v40, 16, v57
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v41, 0xffff, v58
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v39, 0xffff, v57
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v53
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v52
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, 0xffff, v53
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v52
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v38
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v63, 16, v37
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v61, 16, v36
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v58, 0xffff, v39
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v56, 0xffff, v38
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v62, 0xffff, v37
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v60, 0xffff, v36
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v55
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v37, 16, v54
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v38, 0xffff, v55
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v36, 0xffff, v54
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:160
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:176
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT:    buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
@@ -4432,8 +4413,8 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x50
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
@@ -4447,93 +4428,93 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s6, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s3, 0
-; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s5
-; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s7
+; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s2
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[28:29]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[32:35], v[32:33]
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s0
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v27, 16, v13
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v25, 16, v12
 ; GCN-HSA-NEXT:    v_bfe_i32 v26, v13, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v24, v12, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[12:13]
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[32:35], v[32:33]
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[24:27]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[24:27]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v27, 16, v15
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v25, 16, v14
 ; GCN-HSA-NEXT:    v_bfe_i32 v26, v15, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v24, v14, 0, 16
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[24:27]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v8
 ; GCN-HSA-NEXT:    v_bfe_i32 v14, v9, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v12, v8, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[12:15]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[12:15]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 16, v11
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v12, 16, v10
 ; GCN-HSA-NEXT:    v_bfe_i32 v13, v11, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v11, v10, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[11:14]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[11:14]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v39, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v38, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v5
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v4
 ; GCN-HSA-NEXT:    v_bfe_i32 v10, v5, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v8, v4, 0, 16
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 16, v7
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 16, v6
-; GCN-HSA-NEXT:    v_bfe_i32 v9, v7, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v7, v6, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[7:10]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v7
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v6
+; GCN-HSA-NEXT:    v_bfe_i32 v14, v7, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v12, v6, 0, 16
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
 ; GCN-HSA-NEXT:    v_bfe_i32 v6, v1, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v4, v0, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[12:15]
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
 ; GCN-HSA-NEXT:    v_bfe_i32 v10, v3, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v8, v2, 0, 16
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[8:11]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[38:39], v[8:11]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
@@ -4570,20 +4551,20 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(11)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v33
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v32
-; GCN-HSA-NEXT:    v_bfe_i32 v14, v33, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v12, v32, 0, 16
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v29
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v28
+; GCN-HSA-NEXT:    v_bfe_i32 v14, v29, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v12, v28, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v35
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v31
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v34
-; GCN-HSA-NEXT:    v_bfe_i32 v10, v35, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v34, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v30
+; GCN-HSA-NEXT:    v_bfe_i32 v10, v31, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v8, v30, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v21
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v20
@@ -4593,18 +4574,19 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v29
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v28
-; GCN-HSA-NEXT:    v_bfe_i32 v6, v29, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v4, v28, 0, 16
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(14)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v33
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v32
+; GCN-HSA-NEXT:    v_bfe_i32 v6, v33, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v32, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v31
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v35
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v30
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v31, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v30, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v34
+; GCN-HSA-NEXT:    v_bfe_i32 v2, v35, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v0, v34, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
@@ -6527,51 +6509,50 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v2
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v1
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v3
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, 0
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v2
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v1
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v3
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v3
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, 0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v5
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v23, 0xffff, v6
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v6
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xffff, v6
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, 0xffff, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v19, 0xffff, v5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, v20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, v20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v29, v20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, v20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, v20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v20
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v27, 0xffff, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v28, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v30, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v21
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
@@ -6669,10 +6650,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v30, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v23, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v15, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v23, 0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v3
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index b49e39e1356b1..e88b23bcbbe96 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -1,5 +1,5 @@
-# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s
-# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler -amdgpu-disable-unclustred-high-rp-reschedule %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -amdgpu-disable-unclustred-high-rp-reschedule -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s
 # REQUIRES: asserts
 
 ---

diff  --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir
index 3acfb90cf8937..c8d1275f56c96 100644
--- a/llvm/test/CodeGen/AMDGPU/pr51516.mir
+++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustred-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s
 
 # Check that %3 was not rematerialized before the last store since its operand %1
 # is killed by that store.

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 7dffec2058c82..37c3127b624ab 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -452,92 +452,101 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)*  %buffer) {
 ; GFX8-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 0
-; GFX8-NEXT:    s_movk_i32 s0, 0x7f
+; GFX8-NEXT:    s_movk_i32 s4, 0x7f
 ; GFX8-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX8-NEXT:    ; Child Loop BB1_2 Depth 2
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v2
-; GFX8-NEXT:    s_mov_b32 s1, 0
+; GFX8-NEXT:    s_mov_b32 s5, 0
 ; GFX8-NEXT:  .LBB1_2: ; %for.body
 ; GFX8-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0xffffb000, v4
-; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, -1, v5, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[8:9]
+; GFX8-NEXT:    s_mov_b64 s[0:1], vcc
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0xffffb800, v4
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, -1, v5, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[10:11], v[10:11]
+; GFX8-NEXT:    s_mov_b64 s[2:3], vcc
+; GFX8-NEXT:    v_addc_u32_e64 v9, vcc, -1, v5, s[0:1]
+; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[8:9]
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 0xffffc000, v4
-; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, -1, v5, vcc
+; GFX8-NEXT:    s_mov_b64 s[0:1], vcc
+; GFX8-NEXT:    v_addc_u32_e64 v11, vcc, -1, v5, s[2:3]
+; GFX8-NEXT:    flat_load_dwordx2 v[10:11], v[10:11]
 ; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xffffc800, v4
-; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, -1, v5, vcc
-; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 0xffffd000, v4
+; GFX8-NEXT:    s_mov_b64 s[2:3], vcc
+; GFX8-NEXT:    v_addc_u32_e64 v13, vcc, -1, v5, s[0:1]
+; GFX8-NEXT:    s_addk_i32 s5, 0x2000
+; GFX8-NEXT:    s_cmp_gt_u32 s5, 0x3fffff
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v8, v6
+; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[12:13]
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0xffffd000, v4
+; GFX8-NEXT:    s_mov_b64 s[0:1], vcc
+; GFX8-NEXT:    v_addc_u32_e64 v15, vcc, -1, v5, s[2:3]
+; GFX8-NEXT:    flat_load_dwordx2 v[12:13], v[14:15]
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v10, v16
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v11, v7, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0xffffd800, v4
+; GFX8-NEXT:    s_mov_b64 s[2:3], vcc
+; GFX8-NEXT:    v_addc_u32_e64 v7, vcc, -1, v5, s[0:1]
+; GFX8-NEXT:    flat_load_dwordx2 v[6:7], v[6:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v8, v16
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0xffffe000, v4
+; GFX8-NEXT:    s_mov_b64 s[0:1], vcc
+; GFX8-NEXT:    v_addc_u32_e64 v11, vcc, -1, v5, s[2:3]
+; GFX8-NEXT:    flat_load_dwordx2 v[10:11], v[10:11]
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v12, v14
+; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, v13, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 0xffffe800, v4
+; GFX8-NEXT:    s_mov_b64 s[2:3], vcc
+; GFX8-NEXT:    v_addc_u32_e64 v9, vcc, -1, v5, s[0:1]
+; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[8:9]
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v6, v14
+; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v7, v13, vcc
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0xfffff000, v4
+; GFX8-NEXT:    s_mov_b64 s[0:1], vcc
+; GFX8-NEXT:    v_addc_u32_e64 v13, vcc, -1, v5, s[2:3]
 ; GFX8-NEXT:    flat_load_dwordx2 v[12:13], v[12:13]
-; GFX8-NEXT:    flat_load_dwordx2 v[14:15], v[14:15]
-; GFX8-NEXT:    v_addc_u32_e32 v17, vcc, -1, v5, vcc
-; GFX8-NEXT:    v_add_u32_e32 v18, vcc, 0xffffd800, v4
-; GFX8-NEXT:    v_addc_u32_e32 v19, vcc, -1, v5, vcc
-; GFX8-NEXT:    v_add_u32_e32 v20, vcc, 0xffffe000, v4
-; GFX8-NEXT:    flat_load_dwordx2 v[16:17], v[16:17]
-; GFX8-NEXT:    flat_load_dwordx2 v[18:19], v[18:19]
-; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, -1, v5, vcc
-; GFX8-NEXT:    v_add_u32_e32 v22, vcc, 0xffffe800, v4
-; GFX8-NEXT:    v_addc_u32_e32 v23, vcc, -1, v5, vcc
-; GFX8-NEXT:    v_add_u32_e32 v24, vcc, 0xfffff000, v4
-; GFX8-NEXT:    flat_load_dwordx2 v[20:21], v[20:21]
-; GFX8-NEXT:    flat_load_dwordx2 v[22:23], v[22:23]
-; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, -1, v5, vcc
-; GFX8-NEXT:    s_addk_i32 s1, 0x2000
-; GFX8-NEXT:    s_cmp_gt_u32 s1, 0x3fffff
-; GFX8-NEXT:    s_waitcnt vmcnt(7)
-; GFX8-NEXT:    v_add_u32_e32 v26, vcc, v8, v6
-; GFX8-NEXT:    v_addc_u32_e32 v27, vcc, v9, v7, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[24:25]
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0xfffff800, v4
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, -1, v5, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v10, v14
+; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, v11, v7, vcc
+; GFX8-NEXT:    v_addc_u32_e64 v7, s[0:1], -1, v5, s[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0xfffff800, v4
 ; GFX8-NEXT:    flat_load_dwordx2 v[6:7], v[6:7]
-; GFX8-NEXT:    flat_load_dwordx2 v[24:25], v[4:5]
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, -1, v5, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[10:11], v[10:11]
+; GFX8-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v8, v14
+; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, v9, v15, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[4:5]
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x10000, v4
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(9)
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v26
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v11, v27, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(8)
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v12, v10
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v13, v11, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(7)
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v14, v10
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v15, v11, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(6)
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v16, v10
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v17, v11, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(5)
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v18, v10
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v19, v11, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(4)
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v20, v10
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v21, v11, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(3)
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v22, v10
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v23, v11, vcc
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v14
+; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, v13, v15, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
-; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v12
+; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v7, v13, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v10, v6
+; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v11, v7, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v24, v6
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v25, v7, vcc
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
+; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
 ; GFX8-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX8-NEXT:  ; %bb.3: ; %while.cond.loopexit
 ; GFX8-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; GFX8-NEXT:    s_add_i32 s1, s0, -1
-; GFX8-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX8-NEXT:    s_add_i32 s0, s4, -1
+; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB1_5
 ; GFX8-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
-; GFX8-NEXT:    s_mov_b32 s0, s1
+; GFX8-NEXT:    s_mov_b32 s4, s0
 ; GFX8-NEXT:    s_branch .LBB1_1
 ; GFX8-NEXT:  .LBB1_5: ; %while.end
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
@@ -593,62 +602,63 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)*  %buffer) {
 ; GFX900-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX900-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX900-NEXT:    v_add_co_u32_e32 v8, vcc, 0xffffb000, v4
-; GFX900-NEXT:    v_addc_co_u32_e32 v9, vcc, -1, v5, vcc
+; GFX900-NEXT:    s_mov_b64 s[0:1], vcc
+; GFX900-NEXT:    v_addc_co_u32_e64 v9, s[0:1], -1, v5, s[0:1]
 ; GFX900-NEXT:    global_load_dwordx2 v[10:11], v[4:5], off offset:-4096
 ; GFX900-NEXT:    global_load_dwordx2 v[12:13], v[4:5], off offset:-2048
 ; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, 0xffffc000, v4
 ; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[8:9], off
 ; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v5, vcc
-; GFX900-NEXT:    global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
-; GFX900-NEXT:    v_add_co_u32_e32 v16, vcc, s2, v4
-; GFX900-NEXT:    v_addc_co_u32_e32 v17, vcc, -1, v5, vcc
-; GFX900-NEXT:    global_load_dwordx2 v[22:23], v[14:15], off
-; GFX900-NEXT:    global_load_dwordx2 v[24:25], v[16:17], off offset:-2048
-; GFX900-NEXT:    v_add_co_u32_e32 v20, vcc, s3, v4
-; GFX900-NEXT:    v_addc_co_u32_e32 v21, vcc, -1, v5, vcc
-; GFX900-NEXT:    global_load_dwordx2 v[16:17], v[20:21], off offset:-4096
-; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, s5, v4
-; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v5, vcc
 ; GFX900-NEXT:    s_addk_i32 s6, 0x2000
 ; GFX900-NEXT:    s_cmp_gt_u32 s6, 0x3fffff
-; GFX900-NEXT:    s_waitcnt vmcnt(4)
-; GFX900-NEXT:    v_add_co_u32_e64 v28, s[0:1], v8, v6
-; GFX900-NEXT:    v_addc_co_u32_e64 v29, s[0:1], v9, v7, s[0:1]
-; GFX900-NEXT:    global_load_dwordx2 v[6:7], v[20:21], off offset:-2048
-; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[20:21], off
-; GFX900-NEXT:    s_nop 0
-; GFX900-NEXT:    global_load_dwordx2 v[20:21], v[14:15], off offset:-2048
-; GFX900-NEXT:    global_load_dwordx2 v[26:27], v[4:5], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v6
+; GFX900-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v7, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[6:7], v[14:15], off offset:-2048
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_co_u32_e32 v16, vcc, v6, v8
+; GFX900-NEXT:    v_addc_co_u32_e32 v9, vcc, v7, v9, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[7:8], v[14:15], off
+; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, s2, v4
+; GFX900-NEXT:    s_mov_b64 s[0:1], vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v7, v16
+; GFX900-NEXT:    v_addc_co_u32_e64 v7, s[0:1], -1, v5, s[0:1]
+; GFX900-NEXT:    global_load_dwordx2 v[6:7], v[6:7], off offset:-2048
+; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v8, v9, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v8, vcc, s3, v4
+; GFX900-NEXT:    v_addc_co_u32_e32 v9, vcc, -1, v5, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v6, v14
+; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v7, v15, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[6:7], v[8:9], off offset:-4096
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v6, v14
+; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v7, v15, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[6:7], v[8:9], off offset:-2048
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v6, v14
+; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[8:9], off
+; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v7, v15, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, s5, v4
+; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, -1, v5, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[6:7], v[6:7], off offset:-2048
+; GFX900-NEXT:    s_waitcnt vmcnt(1)
+; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v8, v14
+; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v9, v15, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
 ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, 0x10000, v4
 ; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX900-NEXT:    s_waitcnt vmcnt(7)
-; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v18, v28
-; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v19, v29, vcc
-; GFX900-NEXT:    s_waitcnt vmcnt(6)
-; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v22, v14
-; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v23, v15, vcc
-; GFX900-NEXT:    s_waitcnt vmcnt(5)
-; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v24, v14
-; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v25, v15, vcc
-; GFX900-NEXT:    s_waitcnt vmcnt(4)
-; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v16, v14
-; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v17, v15, vcc
-; GFX900-NEXT:    s_waitcnt vmcnt(3)
+; GFX900-NEXT:    s_waitcnt vmcnt(1)
 ; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v14
 ; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v15, vcc
-; GFX900-NEXT:    s_waitcnt vmcnt(2)
-; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
-; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
-; GFX900-NEXT:    s_waitcnt vmcnt(1)
-; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, v20, v6
-; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, v21, v7, vcc
 ; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
 ; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, v11, v7, vcc
 ; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v6
 ; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, v13, v7, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, v26, v6
-; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, v27, v7, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
+; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
 ; GFX900-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX900-NEXT:  ; %bb.3: ; %while.cond.loopexit
 ; GFX900-NEXT:    ; in Loop: Header=BB1_1 Depth=1


        


More information about the llvm-commits mailing list