[llvm] 02e60f2 - [AMDGPU] Use max waves for scheduler's initial occupancy target

Austin Kerbow via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 26 15:31:26 PDT 2021


Author: Austin Kerbow
Date: 2021-10-26T15:30:26-07:00
New Revision: 02e60f2e772575107832623eb7980a748570d3c7

URL: https://github.com/llvm/llvm-project/commit/02e60f2e772575107832623eb7980a748570d3c7
DIFF: https://github.com/llvm/llvm-project/commit/02e60f2e772575107832623eb7980a748570d3c7.diff

LOG: [AMDGPU] Use max waves for scheduler's initial occupancy target

The scheduler should set critical/excess register usage thresholds that
are guided by the maximum possible occupancy for the function. This
change is focused on setting proper lower bounds on register usage which
we would typically only see when a specific number of maximum waves is
requested with the "waves-per-eu" attribute, or by setting
"amdgpu-num-vgpr|sgpr" directly. This was broken previously. I have a
follow-on patch that will address issues with the scheduler not
targeting correct upper bounds on register usage which is typical with
launch bounds and min "waves-per-eu".

Changes by this patch:

Set the initial critical register usage thresholds to minimum values
that are determined by the maximum possible occupancy for the function,
or the number of allocatable registers, whichever is lower.

Avoid unsigned overflow if register limits are lower than the register
tracking "ErrorMargin", i.e. when using stress-regalloc=2.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D112373

Added: 
    llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll

Modified: 
    llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
    llvm/test/CodeGen/AMDGPU/load-global-i16.ll
    llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
    llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 0212b8e17641..5209b1a5a70f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -26,32 +26,36 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
 void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
   GenericScheduler::initialize(DAG);
 
-  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
-
   MF = &DAG->MF;
 
   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
 
   // FIXME: This is also necessary, because some passes that run after
   // scheduling and before regalloc increase register pressure.
-  const int ErrorMargin = 3;
-
-  SGPRExcessLimit = Context->RegClassInfo
-    ->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass) - ErrorMargin;
-  VGPRExcessLimit = Context->RegClassInfo
-    ->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass) - ErrorMargin;
-  if (TargetOccupancy) {
-    SGPRCriticalLimit = ST.getMaxNumSGPRs(TargetOccupancy, true);
-    VGPRCriticalLimit = ST.getMaxNumVGPRs(TargetOccupancy);
-  } else {
-    SGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
-        AMDGPU::RegisterPressureSets::SReg_32);
-    VGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
-        AMDGPU::RegisterPressureSets::VGPR_32);
-  }
-
-  SGPRCriticalLimit -= ErrorMargin;
-  VGPRCriticalLimit -= ErrorMargin;
+  const unsigned ErrorMargin = 3;
+
+  SGPRExcessLimit =
+      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
+  VGPRExcessLimit =
+      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
+
+  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+  // Set the initial TargetOccupancy to the maximum occupancy that we can
+  // achieve for this function. This effectively sets a lower bound on the
+  // 'Critical' register limits in the scheduler.
+  TargetOccupancy = MFI.getOccupancy();
+  SGPRCriticalLimit =
+      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
+  VGPRCriticalLimit =
+      std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
+
+  // Subtract error margin from register limits and avoid overflow.
+  SGPRCriticalLimit =
+      std::min(SGPRCriticalLimit - ErrorMargin, SGPRCriticalLimit);
+  VGPRCriticalLimit =
+      std::min(VGPRCriticalLimit - ErrorMargin, VGPRCriticalLimit);
+  SGPRExcessLimit = std::min(SGPRExcessLimit - ErrorMargin, SGPRExcessLimit);
+  VGPRExcessLimit = std::min(VGPRExcessLimit - ErrorMargin, VGPRExcessLimit);
 }
 
 void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
@@ -361,14 +365,18 @@ void GCNScheduleDAGMILive::schedule() {
     LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
     return;
   }
-  unsigned Occ = MFI.getOccupancy();
-  unsigned WavesAfter = std::min(Occ, PressureAfter.getOccupancy(ST));
-  unsigned WavesBefore = std::min(Occ, PressureBefore.getOccupancy(ST));
+
+  unsigned WavesAfter =
+      std::min(S.TargetOccupancy, PressureAfter.getOccupancy(ST));
+  unsigned WavesBefore =
+      std::min(S.TargetOccupancy, PressureBefore.getOccupancy(ST));
   LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                     << ", after " << WavesAfter << ".\n");
 
-  // We could not keep current target occupancy because of the just scheduled
-  // region. Record new occupancy for next scheduling cycle.
+  // We may not be able to keep the current target occupancy because of the just
+  // scheduled region. We might still be able to revert scheduling if the
+  // occupancy before was higher, or if the current schedule has register
+  // pressure higher than the excess limits which could lead to more spilling.
   unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
   // Allow memory bound functions to drop to 4 waves if not limited by an
   // attribute.
@@ -378,6 +386,7 @@ void GCNScheduleDAGMILive::schedule() {
                       << MFI.getMinAllowedOccupancy() << " waves\n");
     NewOccupancy = WavesAfter;
   }
+
   if (NewOccupancy < MinOccupancy) {
     MinOccupancy = NewOccupancy;
     MFI.limitOccupancy(MinOccupancy);
@@ -394,6 +403,11 @@ void GCNScheduleDAGMILive::schedule() {
     RegionsWithHighRP[RegionIdx] = true;
   }
 
+  // If this condition is true, then either the occupancy before and after
+  // scheduling is the same, or we are allowing the occupancy to drop because
+  // the function is memory bound. Even if we are OK with the current occupancy,
+  // we still need to verify that we will not introduce any extra chance of
+  // spilling.
   if (WavesAfter >= MinOccupancy) {
     if (Stage == UnclusteredReschedule &&
         !PressureAfter.less(ST, PressureBefore)) {
@@ -540,7 +554,6 @@ GCNScheduleDAGMILive::getBBLiveInMap() const {
 }
 
 void GCNScheduleDAGMILive::finalizeSchedule() {
-  GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
   LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
 
   LiveIns.resize(Regions.size());
@@ -586,8 +599,6 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
             dbgs()
             << "Retrying function scheduling with lowest recorded occupancy "
             << MinOccupancy << ".\n");
-
-        S.setTargetOccupancy(MinOccupancy);
       }
     }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index c3c04b4729c3..69ccdb450fc7 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -3477,109 +3477,139 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(7)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v1
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v0
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(6)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, s0, v3
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v32, s0, v2
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v32, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, 0xffff
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v15
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v12
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v18
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, s0, v15
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, s0, v14
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v33, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v34, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v35, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v38, s0, v1
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v36, s0, v0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v5, s0, v13
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v5
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v42, s0, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v40, s0, v6
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, s0, v5
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v32, s0, v4
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v9
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, s0, v11
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, s0, v10
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v46, s0, v9
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v44, s0, v8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v15
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v13
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v12
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, s0, v15
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, s0, v14
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v50, s0, v13
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v48, s0, v12
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v19
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v18
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v17
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v16
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, s0, v19
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, s0, v18
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v54, s0, v17
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v52, s0, v16
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v23
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v22
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v21
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v20
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, s0, v23
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, s0, v22
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v58, s0, v21
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v56, s0, v20
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v27
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v26
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v25
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v24
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, s0, v27
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, s0, v26
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v62, s0, v25
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v60, s0, v24
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v3, s0, v12
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v4, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v5, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v16
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, s0, v19
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, s0, v18
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v7, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v8, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v9, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, s0, v17
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, s0, v16
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v27
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v26
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v25
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v24
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, s0, v27
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, s0, v26
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, s0, v25
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, s0, v24
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v31
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v30
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v29
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v28
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v29
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v28
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, s0, v31
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, s0, v30
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, s0, v29
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, s0, v28
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v33, s0, v29
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v31, s0, v28
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v38
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v37
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v36
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v35
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v45, s0, v38
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v43, s0, v37
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v49, s0, v36
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v47, s0, v35
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v42
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v41
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v40
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v39
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v37, s0, v42
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v35, s0, v41
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[55:58], off, s[8:11], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v53, s0, v40
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v51, s0, v39
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v58
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v57
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v56
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v60, s0, v58
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v58, s0, v57
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, s0, v56
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, s0, v55
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v42
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v41
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v40
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v39
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, s0, v42
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, s0, v41
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, s0, v40
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, s0, v39
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
@@ -3785,122 +3815,109 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s90, -1
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s91, 0xe80000
-; GCN-NOHSA-VI-NEXT:    s_add_u32 s88, s88, s3
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, 0xffff
-; GCN-NOHSA-VI-NEXT:    s_addc_u32 s89, s89, 0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s6
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s7
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, s2
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, s3
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v63, 16, v7
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v62, s4, v7
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v61, 16, v6
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v60, s4, v6
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, s4, v5
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, s0, v19
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v18
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s0, v18
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[56:59], off, s[8:11], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, s0, v17
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, s0, v16
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v23
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v34, s0, v23
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v32, s0, v22
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, s4, v23
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s4, v22
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v22, s0, v21
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v22, s4, v21
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v21, 16, v20
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, s0, v20
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, s4, v20
+; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v27
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v38, s0, v27
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v38, s4, v27
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v36, s0, v26
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v36, s4, v26
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v27, 16, v25
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v26, s0, v25
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v26, s4, v25
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v24
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, s0, v24
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, s4, v24
+; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v31
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v42, s0, v31
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v42, s4, v31
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v41, 16, v30
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v40, s0, v30
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v40, s4, v30
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v29
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v30, s0, v29
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v30, s4, v29
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v28
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, s0, v28
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v47, 16, v15
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v46, s0, v15
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v45, 16, v14
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v44, s0, v14
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, s4, v28
+; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v47, 16, v35
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v46, s4, v35
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v45, 16, v34
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v44, s4, v34
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v33
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v34, s4, v33
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v32, s4, v32
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v19
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v50, s4, v19
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v48, s4, v18
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, s4, v17
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, s4, v16
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v15
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v54, s4, v15
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v14
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v52, s4, v14
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v14, s0, v13
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v14, s4, v13
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, s0, v12
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v11
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v50, s0, v11
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v10
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v48, s0, v10
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, s4, v12
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v59, 16, v11
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v58, s4, v11
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v10
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v56, s4, v10
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, s0, v9
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, s4, v9
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, s0, v8
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v7
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v54, s0, v7
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v6
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v52, s0, v6
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, s0, v5
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, s0, v4
-; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v61, 16, v59
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v60, s0, v59
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v59, 16, v58
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v58, s0, v58
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v57
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, s0, v57
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v56
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s0, v56
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0
-; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT:    buffer_load_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, s4, v8
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:160
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:176
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
@@ -4265,38 +4282,40 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v64i16_to_v64i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s14, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN-NOHSA-SI-NEXT:    s_add_u32 s12, s12, s3
-; GCN-NOHSA-SI-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, -1
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, 0xe8f000
+; GCN-NOHSA-SI-NEXT:    s_add_u32 s8, s8, s3
+; GCN-NOHSA-SI-NEXT:    s_addc_u32 s9, s9, 0
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s6
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s7
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, s2
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, s3
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v11
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v10
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v11, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v10, 0, 16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v9
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v9, 0, 16
@@ -4358,8 +4377,6 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v12
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v13, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v12, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192
@@ -4375,10 +4392,10 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -4589,27 +4606,28 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    s_add_u32 s88, s88, s3
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s6
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s7
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, s2
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, s3
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(6)
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v59, 16, v13
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v63, 16, v17
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(5)
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v55, 16, v17
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v59, 16, v21
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v51, 16, v21
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v55, 16, v25
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v11
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v10
@@ -4620,22 +4638,18 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v9
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v8
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v9, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v8, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 16, v27
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 16, v26
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v27, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v26, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v39, 16, v25
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v37, 16, v24
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v38, v25, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v36, v24, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v27, 16, v31
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v25, 16, v30
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v26, v31, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v24, v30, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v13
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v12
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v13, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v12, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 16, v9
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 16, v8
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v9, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v8, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 16, v31
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 16, v30
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v31, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v30, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v43, 16, v29
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v41, 16, v28
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v42, v29, 0, 16
@@ -4648,62 +4662,54 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v45, 16, v32
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v46, v33, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v44, v32, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v35, 16, v23
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v33, 16, v22
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v34, v23, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v32, v22, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v49, 16, v20
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v50, v21, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v48, v20, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v35, 16, v39
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v33, 16, v38
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v34, v39, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v32, v38, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v51, 16, v37
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v49, 16, v36
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v50, v37, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v48, v36, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v39, 16, v27
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v37, 16, v26
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v38, v27, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v36, v26, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v53, 16, v24
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v54, v25, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v52, v24, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v26, 16, v23
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v24, 16, v22
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v25, v23, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v23, v22, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v57, 16, v20
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v58, v21, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v56, v20, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v22, 16, v19
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v20, 16, v18
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v21, v19, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v19, v18, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v53, 16, v16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v54, v17, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v52, v16, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v61, 16, v16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v62, v17, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v60, v16, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v18, 16, v15
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 16, v14
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v17, v15, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v14, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v57, 16, v12
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v58, v13, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v56, v12, 0, 16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[60:63], off, s[8:11], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v61
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v60
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v61, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v60, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 16, v63
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 16, v62
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v63, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v62, 0, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT:    buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
index 8ad70cf95c09..e5f08dbedce3 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
@@ -1,8 +1,17 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -enable-amdgpu-aa=0 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -enable-amdgpu-aa=0 -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=MISCHED %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s
+
+; Test the scheduler when only one wave is requested. The result should be high register usage and max ILP.
 
 ; We expect a three digit VGPR usage here since only one wave requested.
-; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}}
+;
+; GCN-ILP: NumVgprs: {{[0-9][0-9][0-9]$}}
+
+; FIXME: The machine scheduler is doing a poor job at maximizing ILP here.
+; However, had we not requested only one wave, register usage would indeed be
+; much lower; demonstrating that difference is the purpose of this test.
+;
+; MISCHED: NumVgprs: {{[7-9][0-9]$}}
 
 define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) #1 {
 bb:

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll
new file mode 100644
index 000000000000..98025b184f20
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll
@@ -0,0 +1,110 @@
+; REQUIRES: asserts
+
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -debug-only=machine-scheduler -o /dev/null < %s 2>&1 | FileCheck %s
+
+; We are only targeting one wave. Check that the machine scheduler doesn't use
+; register pressure heuristics to prioritize any candidate instruction.
+
+; CHECK-NOT: REG-CRIT
+; CHECK-NOT: REG-EXCESS
+
+define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(3)* nocapture %arg1) #1 {
+bb:
+  %tmp0 = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
+  %tmp1 = load float, float addrspace(3)* %tmp0, align 4
+  %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
+  %tmp3 = load float, float addrspace(3)* %tmp2, align 4
+  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 3
+  %tmp5 = load float, float addrspace(3)* %tmp4, align 4
+  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4
+  %tmp7 = load float, float addrspace(3)* %tmp6, align 4
+  %tmp8 = getelementptr inbounds float, float addrspace(3)* %arg, i32 5
+  %tmp9 = load float, float addrspace(3)* %tmp8, align 4
+  %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6
+  %tmp11 = load float, float addrspace(3)* %tmp10, align 4
+  %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 7
+  %tmp13 = load float, float addrspace(3)* %tmp12, align 4
+  %tmp14 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8
+  %tmp15 = load float, float addrspace(3)* %tmp14, align 4
+  %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 9
+  %tmp17 = load float, float addrspace(3)* %tmp16, align 4
+  %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10
+  %tmp19 = load float, float addrspace(3)* %tmp18, align 4
+  %tmp20 = getelementptr inbounds float, float addrspace(3)* %arg, i32 11
+  %tmp21 = load float, float addrspace(3)* %tmp20, align 4
+  %tmp22 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12
+  %tmp23 = load float, float addrspace(3)* %tmp22, align 4
+  %tmp24 = getelementptr inbounds float, float addrspace(3)* %arg, i32 13
+  %tmp25 = load float, float addrspace(3)* %tmp24, align 4
+  %tmp26 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14
+  %tmp27 = load float, float addrspace(3)* %tmp26, align 4
+  %tmp28 = getelementptr inbounds float, float addrspace(3)* %arg, i32 15
+  %tmp29 = load float, float addrspace(3)* %tmp28, align 4
+  %tmp30 = getelementptr inbounds float, float addrspace(3)* %arg, i32 16
+  %tmp31 = load float, float addrspace(3)* %tmp30, align 4
+  %tmp32 = getelementptr inbounds float, float addrspace(3)* %arg, i32 17
+  %tmp33 = load float, float addrspace(3)* %tmp32, align 4
+  %tmp34 = getelementptr inbounds float, float addrspace(3)* %arg, i32 18
+  %tmp35 = load float, float addrspace(3)* %tmp34, align 4
+  %tmp36 = getelementptr inbounds float, float addrspace(3)* %arg, i32 19
+  %tmp37 = load float, float addrspace(3)* %tmp36, align 4
+  %tmp38 = getelementptr inbounds float, float addrspace(3)* %arg, i32 20
+  %tmp39 = load float, float addrspace(3)* %tmp38, align 4
+  %tmp40 = getelementptr inbounds float, float addrspace(3)* %arg, i32 21
+  %tmp41 = load float, float addrspace(3)* %tmp40, align 4
+  %tmp42 = getelementptr inbounds float, float addrspace(3)* %arg, i32 22
+  %tmp43 = load float, float addrspace(3)* %tmp42, align 4
+  %tmp44 = getelementptr inbounds float, float addrspace(3)* %arg, i32 23
+  %tmp45 = load float, float addrspace(3)* %tmp44, align 4
+  %tmp46 = getelementptr inbounds float, float addrspace(3)* %arg, i32 24
+  %tmp47 = load float, float addrspace(3)* %tmp46, align 4
+  %tmp48 = getelementptr inbounds float, float addrspace(3)* %arg, i32 25
+  %tmp49 = load float, float addrspace(3)* %tmp48, align 4
+  %tmp50 = getelementptr inbounds float, float addrspace(3)* %arg, i32 26
+  %tmp51 = load float, float addrspace(3)* %tmp50, align 4
+  %tmp52 = getelementptr inbounds float, float addrspace(3)* %arg, i32 27
+  %tmp53 = load float, float addrspace(3)* %tmp52, align 4
+  %tmp54 = getelementptr inbounds float, float addrspace(3)* %arg, i32 28
+  %tmp55 = load float, float addrspace(3)* %tmp54, align 4
+  %tmp56 = getelementptr inbounds float, float addrspace(3)* %arg, i32 29
+  %tmp57 = load float, float addrspace(3)* %tmp56, align 4
+  %tmp58 = getelementptr inbounds float, float addrspace(3)* %arg, i32 30
+  %tmp59 = load float, float addrspace(3)* %tmp58, align 4
+  %tmp60 = tail call float @llvm.fmuladd.f32(float %tmp1, float %tmp3, float %tmp5)
+  %tmp61 = tail call float @llvm.fmuladd.f32(float %tmp7, float %tmp9, float %tmp11)
+  %tmp62 = tail call float @llvm.fmuladd.f32(float %tmp13, float %tmp15, float %tmp17)
+  %tmp63 = tail call float @llvm.fmuladd.f32(float %tmp19, float %tmp21, float %tmp23)
+  %tmp64 = tail call float @llvm.fmuladd.f32(float %tmp25, float %tmp27, float %tmp29)
+  %tmp65 = tail call float @llvm.fmuladd.f32(float %tmp31, float %tmp33, float %tmp35)
+  %tmp66 = tail call float @llvm.fmuladd.f32(float %tmp37, float %tmp39, float %tmp41)
+  %tmp67 = tail call float @llvm.fmuladd.f32(float %tmp43, float %tmp45, float %tmp47)
+  %tmp68 = tail call float @llvm.fmuladd.f32(float %tmp49, float %tmp51, float %tmp53)
+  %tmp69 = tail call float @llvm.fmuladd.f32(float %tmp55, float %tmp57, float %tmp59)
+  %tmp70 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 1
+  store float %tmp60, float addrspace(3)* %tmp70, align 4
+  %tmp71 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 2
+  store float %tmp61, float addrspace(3)* %tmp71, align 4
+  %tmp72 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 3
+  store float %tmp62, float addrspace(3)* %tmp72, align 4
+  %tmp73 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 4
+  store float %tmp63, float addrspace(3)* %tmp73, align 4
+  %tmp74 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 5
+  store float %tmp64, float addrspace(3)* %tmp74, align 4
+  %tmp75 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 6
+  store float %tmp65, float addrspace(3)* %tmp75, align 4
+  %tmp76 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 7
+  store float %tmp66, float addrspace(3)* %tmp76, align 4
+  %tmp77 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 8
+  store float %tmp67, float addrspace(3)* %tmp77, align 4
+  %tmp78 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 9
+  store float %tmp68, float addrspace(3)* %tmp78, align 4
+  %tmp79 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 10
+  store float %tmp69, float addrspace(3)* %tmp79, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.fmuladd.f32(float, float, float) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
index 29eac20536b0..639fb82faca1 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
@@ -7,10 +7,10 @@
 
 # CHECK-LABEL: name: expecting_non_empty_interval
 
-# CHECK: undef %7.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %7.sub1, implicit $mode, implicit $exec
-# CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
-# CHECK-NEXT: undef %5.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec
+# CHECK: undef %5.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec
 # CHECK-NEXT: dead %3:vgpr_32 = V_MUL_F32_e32 0, %5.sub1, implicit $mode, implicit $exec
+# CHECK-NEXT: undef %7.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %7.sub1, implicit $mode, implicit $exec
+# CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
 
 # CHECK: S_NOP 0, implicit %6.sub1
 # CHECK-NEXT: %8:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
@@ -47,9 +47,9 @@ body:             |
 
 # CHECK: bb.1:
 # CHECK-NEXT: S_NOP 0, implicit %1.sub2
-# CHECK-NEXT: S_NOP 0, implicit undef %4.sub0
 # CHECK-NEXT: undef %2.sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec
 # CHECK-NEXT: S_NOP 0, implicit %2.sub2
+# CHECK-NEXT: S_NOP 0, implicit undef %4.sub0
 name: rematerialize_empty_interval_has_reference
 tracksRegLiveness: true
 machineFunctionInfo:


        


More information about the llvm-commits mailing list