[llvm] 113052b - [AMDGPU] Prefer lower total register usage in regions with spilling

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 26 12:21:53 PST 2024


Author: Jeffrey Byrnes
Date: 2024-02-26T12:19:52-08:00
New Revision: 113052b2b022c4ce45c8003057ae4297d48ed843

URL: https://github.com/llvm/llvm-project/commit/113052b2b022c4ce45c8003057ae4297d48ed843
DIFF: https://github.com/llvm/llvm-project/commit/113052b2b022c4ce45c8003057ae4297d48ed843.diff

LOG: [AMDGPU] Prefer lower total register usage in regions with spilling

Change-Id: Ia5c434b0945bdcbc357c5e06c3164118fc91df25

Added: 
    llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir

Modified: 
    llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
    llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
    llvm/lib/Target/AMDGPU/GCNRegPressure.h
    llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
    llvm/lib/Target/AMDGPU/GCNSubtarget.h
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
    llvm/test/CodeGen/AMDGPU/bf16.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index cdc9de7f65e3e5..aebfe154b31395 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -409,9 +409,8 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
 
 // Sort recorded regions by pressure - highest at the front
 void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
-  const auto &ST = MF.getSubtarget<GCNSubtarget>();
-  llvm::sort(Regions, [&ST, TargetOcc](const Region *R1, const Region *R2) {
-    return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc);
+  llvm::sort(Regions, [this, TargetOcc](const Region *R1, const Region *R2) {
+    return R2->MaxPressure.less(MF, R1->MaxPressure, TargetOcc);
   });
 }
 
@@ -517,26 +516,25 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
 // Minimal Register Strategy
 
 void GCNIterativeScheduler::scheduleMinReg(bool force) {
-  const auto &ST = MF.getSubtarget<GCNSubtarget>();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const auto TgtOcc = MFI->getOccupancy();
   sortRegionsByPressure(TgtOcc);
 
   auto MaxPressure = Regions.front()->MaxPressure;
   for (auto *R : Regions) {
-    if (!force && R->MaxPressure.less(ST, MaxPressure, TgtOcc))
+    if (!force && R->MaxPressure.less(MF, MaxPressure, TgtOcc))
       break;
 
     BuildDAG DAG(*R, *this);
     const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
 
     const auto RP = getSchedulePressure(*R, MinSchedule);
-    LLVM_DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
+    LLVM_DEBUG(if (R->MaxPressure.less(MF, RP, TgtOcc)) {
       dbgs() << "\nWarning: Pressure becomes worse after minreg!";
       printSchedRP(dbgs(), R->MaxPressure, RP);
     });
 
-    if (!force && MaxPressure.less(ST, RP, TgtOcc))
+    if (!force && MaxPressure.less(MF, RP, TgtOcc))
       break;
 
     scheduleRegion(*R, MinSchedule, RP);

diff  --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index fd8f0bebd3bec3..5c394e6d6296d0 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -88,9 +88,10 @@ void GCNRegPressure::inc(unsigned Reg,
   }
 }
 
-bool GCNRegPressure::less(const GCNSubtarget &ST,
-                          const GCNRegPressure& O,
+bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
                           unsigned MaxOccupancy) const {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
   const auto SGPROcc = std::min(MaxOccupancy,
                                 ST.getOccupancyWithNumSGPRs(getSGPRNum()));
   const auto VGPROcc =
@@ -104,18 +105,103 @@ bool GCNRegPressure::less(const GCNSubtarget &ST,
 
   const auto Occ = std::min(SGPROcc, VGPROcc);
   const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
+
+  // Give first precedence to the better occupancy.
   if (Occ != OtherOcc)
     return Occ > OtherOcc;
 
+  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+
+  // SGPR excess pressure conditions
+  unsigned ExcessSGPR = std::max(static_cast<int>(getSGPRNum() - MaxSGPRs), 0);
+  unsigned OtherExcessSGPR =
+      std::max(static_cast<int>(O.getSGPRNum() - MaxSGPRs), 0);
+
+  auto WaveSize = ST.getWavefrontSize();
+  // The number of virtual VGPRs required to handle excess SGPR
+  unsigned VGPRForSGPRSpills = (ExcessSGPR + (WaveSize - 1)) / WaveSize;
+  unsigned OtherVGPRForSGPRSpills =
+      (OtherExcessSGPR + (WaveSize - 1)) / WaveSize;
+
+  unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
+
+  // Unified excess pressure conditions, accounting for VGPRs used for SGPR
+  // spills
+  unsigned ExcessVGPR =
+      std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) +
+                                VGPRForSGPRSpills - MaxVGPRs),
+               0);
+  unsigned OtherExcessVGPR =
+      std::max(static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) +
+                                OtherVGPRForSGPRSpills - MaxVGPRs),
+               0);
+  // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR
+  // spills
+  unsigned ExcessArchVGPR = std::max(
+      static_cast<int>(getVGPRNum(false) + VGPRForSGPRSpills - MaxArchVGPRs),
+      0);
+  unsigned OtherExcessArchVGPR =
+      std::max(static_cast<int>(O.getVGPRNum(false) + OtherVGPRForSGPRSpills -
+                                MaxArchVGPRs),
+               0);
+  // AGPR excess pressure conditions
+  unsigned ExcessAGPR = std::max(
+      static_cast<int>(ST.hasGFX90AInsts() ? (getAGPRNum() - MaxArchVGPRs)
+                                           : (getAGPRNum() - MaxVGPRs)),
+      0);
+  unsigned OtherExcessAGPR = std::max(
+      static_cast<int>(ST.hasGFX90AInsts() ? (O.getAGPRNum() - MaxArchVGPRs)
+                                           : (O.getAGPRNum() - MaxVGPRs)),
+      0);
+
+  bool ExcessRP = ExcessSGPR || ExcessVGPR || ExcessArchVGPR || ExcessAGPR;
+  bool OtherExcessRP = OtherExcessSGPR || OtherExcessVGPR ||
+                       OtherExcessArchVGPR || OtherExcessAGPR;
+
+  // Give second precedence to the reduced number of spills to hold the register
+  // pressure.
+  if (ExcessRP || OtherExcessRP) {
+    // The difference in excess VGPR pressure, after including VGPRs used for
+    // SGPR spills
+    int VGPRDiff = ((OtherExcessVGPR + OtherExcessArchVGPR + OtherExcessAGPR) -
+                    (ExcessVGPR + ExcessArchVGPR + ExcessAGPR));
+
+    int SGPRDiff = OtherExcessSGPR - ExcessSGPR;
+
+    if (VGPRDiff != 0)
+      return VGPRDiff > 0;
+    if (SGPRDiff != 0) {
+      unsigned PureExcessVGPR =
+          std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs),
+                   0) +
+          std::max(static_cast<int>(getVGPRNum(false) - MaxArchVGPRs), 0);
+      unsigned OtherPureExcessVGPR =
+          std::max(
+              static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs),
+              0) +
+          std::max(static_cast<int>(O.getVGPRNum(false) - MaxArchVGPRs), 0);
+
+      // If we have a special case where there is a tie in excess VGPR, but one
+      // of the pressures has VGPR usage from SGPR spills, prefer the pressure
+      // with SGPR spills.
+      if (PureExcessVGPR != OtherPureExcessVGPR)
+        return SGPRDiff < 0;
+      // If both pressures have the same excess pressure before and after
+      // accounting for SGPR spills, prefer fewer SGPR spills.
+      return SGPRDiff > 0;
+    }
+  }
+
   bool SGPRImportant = SGPROcc < VGPROcc;
   const bool OtherSGPRImportant = OtherSGPROcc < OtherVGPROcc;
 
-  // if both pressures disagree on what is more important compare vgprs
+  // If both pressures disagree on what is more important compare vgprs.
   if (SGPRImportant != OtherSGPRImportant) {
     SGPRImportant = false;
   }
 
-  // compare large regs pressure
+  // Give third precedence to lower register tuple pressure.
   bool SGPRFirst = SGPRImportant;
   for (int I = 2; I > 0; --I, SGPRFirst = !SGPRFirst) {
     if (SGPRFirst) {
@@ -130,6 +216,8 @@ bool GCNRegPressure::less(const GCNSubtarget &ST,
         return VW < OtherVW;
     }
   }
+
+  // Give final precedence to lower general RP.
   return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
                          (getVGPRNum(ST.hasGFX90AInsts()) <
                           O.getVGPRNum(ST.hasGFX90AInsts()));

diff  --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 4100970fe1a962..752f53752fa68b 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -74,8 +74,20 @@ struct GCNRegPressure {
     return getOccupancy(ST) > O.getOccupancy(ST);
   }
 
-  bool less(const GCNSubtarget &ST, const GCNRegPressure& O,
-    unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
+  /// Compares \p this GCNRegpressure to \p O, returning true if \p this is
+  /// less. Since GCNRegpressure contains different types of pressures, and due
+  /// to target-specific peculiarities (e.g. we care about occupancy rather than
+  /// raw register usage), we determine if \p this GCNRegPressure is less than
+  /// \p O based on the following tiered comparisons (in order of
+  /// precedence):
+  /// 1. Better occupancy
+  /// 2. Less spilling (first preference to VGPR spills, then to SGPR spills)
+  /// 3. Less tuple register pressure (first preference to VGPR tuples if we
+  /// determine that SGPR pressure is not important)
+  /// 4. Less raw register pressure (first preference to VGPR tuples if we
+  /// determine that SGPR pressure is not important)
+  bool less(const MachineFunction &MF, const GCNRegPressure &O,
+            unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
 
   bool operator==(const GCNRegPressure &O) const {
     return std::equal(&Value[0], &Value[TOTAL_KINDS], O.Value);

diff  --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 3f3550d3029aad..f993ec8409c997 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -977,6 +977,7 @@ void GCNSchedStage::checkScheduling() {
 
   unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+
   if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
       PressureAfter.getAGPRNum() > MaxVGPRs ||
       PressureAfter.getSGPRNum() > MaxSGPRs) {
@@ -1199,9 +1200,8 @@ bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
 }
 
 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
-  if (WavesAfter <= MFI.getMinWavesPerEU() &&
-      !PressureAfter.less(ST, PressureBefore) &&
-      isRegionWithExcessRP()) {
+  if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
+      !PressureAfter.less(MF, PressureBefore)) {
     LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
     return true;
   }

diff  --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 46fceb8ae22c18..a933c16b6ed516 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1382,6 +1382,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
   }
 
+  /// \returns Addressable number of architectural VGPRs supported by the
+  /// subtarget.
+  unsigned getAddressableNumArchVGPRs() const {
+    return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
+  }
+
   /// \returns Addressable number of VGPRs supported by the subtarget.
   unsigned getAddressableNumVGPRs() const {
     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ce91e05e5cc810..177d99a0ac0abe 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1107,10 +1107,12 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
   return IsWave32 ? 1024 : 512;
 }
 
+unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
+
 unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
   if (STI->getFeatureBits().test(FeatureGFX90AInsts))
     return 512;
-  return 256;
+  return getAddressableNumArchVGPRs(STI);
 }
 
 unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 6826cd27319507..9a6d0834679eae 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -295,6 +295,10 @@ unsigned getVGPREncodingGranule(
 /// \returns Total number of VGPRs for given subtarget \p STI.
 unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
 
+/// \returns Addressable number of architectural VGPRs for a given subtarget \p
+/// STI.
+unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI);
+
 /// \returns Addressable number of VGPRs for given subtarget \p STI.
 unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI);
 

diff  --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 8ec7dfd93cd098..c773742a459507 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -37615,266 +37615,283 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GCN-LABEL: v_vselect_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_and_b32_e32 v5, 1, v5
+; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GCN-NEXT:    v_and_b32_e32 v36, 1, v13
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:48
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:176
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:52
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:180
 ; GCN-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:56
 ; GCN-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:184
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:188
 ; GCN-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
 ; GCN-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:192
+; GCN-NEXT:    v_and_b32_e32 v53, 1, v26
+; GCN-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:84
+; GCN-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:88
+; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:92
+; GCN-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:96
+; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:100
+; GCN-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:104
+; GCN-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:108
+; GCN-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:112
+; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
+; GCN-NEXT:    v_and_b32_e32 v28, 1, v28
 ; GCN-NEXT:    v_and_b32_e32 v29, 1, v29
 ; GCN-NEXT:    v_and_b32_e32 v30, 1, v30
-; GCN-NEXT:    v_and_b32_e32 v48, 1, v28
-; GCN-NEXT:    v_and_b32_e32 v50, 1, v27
-; GCN-NEXT:    v_and_b32_e32 v52, 1, v26
-; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:92
-; GCN-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:220
-; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:96
-; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:224
-; GCN-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:104
-; GCN-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:232
-; GCN-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:108
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:236
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:112
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:240
-; GCN-NEXT:    s_waitcnt expcnt(6)
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:120
-; GCN-NEXT:    s_waitcnt expcnt(5)
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:248
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:116
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:120
 ; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:124
-; GCN-NEXT:    s_waitcnt expcnt(4)
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:252
-; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:128
-; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:256
+; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:252
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:248
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:244
+; GCN-NEXT:    s_waitcnt expcnt(6)
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:240
 ; GCN-NEXT:    s_waitcnt vmcnt(14)
-; GCN-NEXT:    v_mul_f32_e32 v41, 1.0, v37
-; GCN-NEXT:    v_mul_f32_e32 v42, 1.0, v38
+; GCN-NEXT:    v_mul_f32_e32 v40, 1.0, v37
+; GCN-NEXT:    v_mul_f32_e32 v38, 1.0, v38
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v36
-; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    s_waitcnt vmcnt(5)
 ; GCN-NEXT:    v_mul_f32_e32 v36, 1.0, v43
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_mul_f32_e32 v37, 1.0, v56
+; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    v_mul_f32_e32 v37, 1.0, v44
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v30
 ; GCN-NEXT:    v_cndmask_b32_e64 v30, v37, v36, s[4:5]
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32
-; GCN-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:116
-; GCN-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:244
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:100
-; GCN-NEXT:    s_waitcnt expcnt(3)
+; GCN-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:236
+; GCN-NEXT:    s_waitcnt expcnt(5)
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:232
+; GCN-NEXT:    s_waitcnt expcnt(4)
 ; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:228
+; GCN-NEXT:    s_waitcnt expcnt(3)
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:224
 ; GCN-NEXT:    s_waitcnt expcnt(2)
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:84
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:220
 ; GCN-NEXT:    s_waitcnt expcnt(1)
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:212
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:216
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:68
-; GCN-NEXT:    v_mul_f32_e32 v38, 1.0, v46
-; GCN-NEXT:    v_mul_f32_e32 v46, 1.0, v47
-; GCN-NEXT:    s_waitcnt vmcnt(6)
-; GCN-NEXT:    v_mul_f32_e32 v36, 1.0, v36
-; GCN-NEXT:    s_waitcnt vmcnt(5)
-; GCN-NEXT:    v_mul_f32_e32 v37, 1.0, v37
-; GCN-NEXT:    v_mul_f32_e32 v44, 1.0, v44
-; GCN-NEXT:    v_mul_f32_e32 v45, 1.0, v45
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:212
+; GCN-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:128
+; GCN-NEXT:    v_mul_f32_e32 v42, 1.0, v42
+; GCN-NEXT:    s_waitcnt vmcnt(10)
+; GCN-NEXT:    v_mul_f32_e32 v43, 1.0, v45
+; GCN-NEXT:    v_mul_f32_e32 v41, 1.0, v41
+; GCN-NEXT:    s_waitcnt vmcnt(9)
+; GCN-NEXT:    v_mul_f32_e32 v44, 1.0, v46
 ; GCN-NEXT:    v_mul_f32_e32 v55, 1.0, v55
-; GCN-NEXT:    v_mul_f32_e32 v40, 1.0, v40
+; GCN-NEXT:    s_waitcnt vmcnt(8)
+; GCN-NEXT:    v_mul_f32_e32 v45, 1.0, v47
+; GCN-NEXT:    v_mul_f32_e32 v54, 1.0, v54
+; GCN-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NEXT:    v_mul_f32_e32 v36, 1.0, v36
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v29
-; GCN-NEXT:    v_cndmask_b32_e64 v29, v46, v38, s[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v48
-; GCN-NEXT:    v_cndmask_b32_e64 v36, v37, v36, s[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v50
-; GCN-NEXT:    v_cndmask_b32_e64 v37, v45, v44, s[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v52
-; GCN-NEXT:    v_cndmask_b32_e64 v38, v40, v55, s[4:5]
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:136
-; GCN-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:12
-; GCN-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:140
-; GCN-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:16
-; GCN-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:144
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:24
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:152
-; GCN-NEXT:    v_and_b32_e32 v9, 1, v9
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v21, 1, v21
-; GCN-NEXT:    v_and_b32_e32 v25, 1, v25
-; GCN-NEXT:    v_and_b32_e32 v24, 1, v24
-; GCN-NEXT:    v_and_b32_e32 v23, 1, v23
+; GCN-NEXT:    v_cndmask_b32_e64 v29, v43, v42, s[4:5]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v28
+; GCN-NEXT:    v_cndmask_b32_e64 v28, v44, v41, s[4:5]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v27
+; GCN-NEXT:    v_cndmask_b32_e64 v27, v45, v55, s[4:5]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v53
+; GCN-NEXT:    v_cndmask_b32_e64 v36, v36, v54, s[4:5]
+; GCN-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:132
+; GCN-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:8
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:136
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:12
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:140
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:16
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:144
+; GCN-NEXT:    v_and_b32_e32 v3, 1, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
+; GCN-NEXT:    v_and_b32_e32 v5, 1, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 1, v6
+; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
 ; GCN-NEXT:    v_and_b32_e32 v22, 1, v22
-; GCN-NEXT:    v_mul_f32_e32 v53, 1.0, v53
-; GCN-NEXT:    v_mul_f32_e32 v54, 1.0, v54
+; GCN-NEXT:    v_and_b32_e32 v23, 1, v23
+; GCN-NEXT:    v_and_b32_e32 v24, 1, v24
+; GCN-NEXT:    v_and_b32_e32 v25, 1, v25
+; GCN-NEXT:    v_mul_f32_e32 v52, 1.0, v52
+; GCN-NEXT:    s_waitcnt vmcnt(14)
+; GCN-NEXT:    v_mul_f32_e32 v46, 1.0, v56
+; GCN-NEXT:    v_mul_f32_e32 v51, 1.0, v51
+; GCN-NEXT:    s_waitcnt vmcnt(13)
+; GCN-NEXT:    v_mul_f32_e32 v47, 1.0, v57
+; GCN-NEXT:    v_mul_f32_e32 v50, 1.0, v50
 ; GCN-NEXT:    s_waitcnt vmcnt(12)
-; GCN-NEXT:    v_mul_f32_e32 v47, 1.0, v56
-; GCN-NEXT:    s_waitcnt vmcnt(11)
-; GCN-NEXT:    v_mul_f32_e32 v56, 1.0, v57
+; GCN-NEXT:    v_mul_f32_e32 v56, 1.0, v58
 ; GCN-NEXT:    v_mul_f32_e32 v49, 1.0, v49
-; GCN-NEXT:    v_mul_f32_e32 v51, 1.0, v51
-; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT:    v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT:    s_waitcnt vmcnt(11)
+; GCN-NEXT:    v_mul_f32_e32 v57, 1.0, v59
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v25
-; GCN-NEXT:    v_cndmask_b32_e64 v25, v54, v53, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v25, v46, v52, s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v24
-; GCN-NEXT:    v_cndmask_b32_e64 v24, v56, v47, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v24, v47, v51, s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v23
-; GCN-NEXT:    v_cndmask_b32_e64 v23, v51, v49, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v23, v56, v50, s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v22
-; GCN-NEXT:    v_cndmask_b32_e64 v22, v39, v28, s[4:5]
-; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:72
-; GCN-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:200
-; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:76
-; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:204
-; GCN-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:80
-; GCN-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:208
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:88
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:216
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_cndmask_b32_e64 v22, v57, v49, s[4:5]
+; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:68
+; GCN-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:196
+; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:72
+; GCN-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:200
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:76
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:204
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:80
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:208
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
-; GCN-NEXT:    v_and_b32_e32 v14, 1, v14
-; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_and_b32_e32 v21, 1, v21
+; GCN-NEXT:    v_mul_f32_e32 v48, 1.0, v48
+; GCN-NEXT:    s_waitcnt vmcnt(14)
+; GCN-NEXT:    v_mul_f32_e32 v58, 1.0, v60
+; GCN-NEXT:    v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT:    v_mul_f32_e32 v59, 1.0, v61
+; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    v_mul_f32_e32 v46, 1.0, v46
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_mul_f32_e32 v47, 1.0, v47
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_f32_e32 v56, 1.0, v56
-; GCN-NEXT:    v_mul_f32_e32 v57, 1.0, v58
-; GCN-NEXT:    v_mul_f32_e32 v58, 1.0, v59
-; GCN-NEXT:    v_mul_f32_e32 v53, 1.0, v53
-; GCN-NEXT:    v_mul_f32_e32 v54, 1.0, v54
-; GCN-NEXT:    v_mul_f32_e32 v49, 1.0, v49
-; GCN-NEXT:    v_mul_f32_e32 v51, 1.0, v51
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v57, 1.0, v57
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v21
-; GCN-NEXT:    v_cndmask_b32_e64 v21, v56, v47, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v21, v58, v48, s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v20
-; GCN-NEXT:    v_cndmask_b32_e64 v20, v58, v57, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v20, v59, v39, s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v19
-; GCN-NEXT:    v_cndmask_b32_e64 v19, v54, v53, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v19, v57, v56, s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v18
-; GCN-NEXT:    v_cndmask_b32_e64 v18, v51, v49, s[4:5]
-; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:196
-; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:52
-; GCN-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:180
-; GCN-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:36
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:164
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:20
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:148
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:4
-; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT:    v_mul_f32_e32 v39, 1.0, v39
-; GCN-NEXT:    v_mul_f32_e32 v59, 1.0, v60
-; GCN-NEXT:    s_waitcnt vmcnt(7)
-; GCN-NEXT:    v_mul_f32_e32 v49, 1.0, v49
-; GCN-NEXT:    v_mul_f32_e32 v34, 1.0, v34
-; GCN-NEXT:    v_mul_f32_e32 v35, 1.0, v35
+; GCN-NEXT:    v_cndmask_b32_e64 v18, v47, v46, s[4:5]
+; GCN-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:20
+; GCN-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:148
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:24
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:152
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:28
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:156
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:32
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:160
+; GCN-NEXT:    v_and_b32_e32 v7, 1, v7
+; GCN-NEXT:    v_and_b32_e32 v8, 1, v8
+; GCN-NEXT:    v_and_b32_e32 v9, 1, v9
+; GCN-NEXT:    v_and_b32_e32 v10, 1, v10
+; GCN-NEXT:    v_and_b32_e32 v14, 1, v14
+; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
+; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
+; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
 ; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GCN-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT:    v_mul_f32_e32 v34, 1.0, v34
+; GCN-NEXT:    v_mul_f32_e32 v35, 1.0, v35
+; GCN-NEXT:    v_mul_f32_e32 v49, 1.0, v49
+; GCN-NEXT:    v_mul_f32_e32 v50, 1.0, v50
+; GCN-NEXT:    v_mul_f32_e32 v51, 1.0, v51
+; GCN-NEXT:    v_mul_f32_e32 v52, 1.0, v52
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v17
-; GCN-NEXT:    v_cndmask_b32_e64 v17, v39, v28, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v17, v52, v51, s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
-; GCN-NEXT:    v_cndmask_b32_e64 v16, v49, v59, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v16, v50, v49, s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v15
 ; GCN-NEXT:    v_cndmask_b32_e64 v15, v35, v34, s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v14
 ; GCN-NEXT:    v_cndmask_b32_e64 v14, v33, v32, s[4:5]
-; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:28
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:156
-; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
-; GCN-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:160
-; GCN-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:40
-; GCN-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:168
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:164
+; GCN-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:40
+; GCN-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:168
 ; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:44
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:172
-; GCN-NEXT:    v_and_b32_e32 v12, 1, v12
+; GCN-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:172
+; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:48
+; GCN-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:176
 ; GCN-NEXT:    v_and_b32_e32 v11, 1, v11
-; GCN-NEXT:    v_and_b32_e32 v10, 1, v10
-; GCN-NEXT:    v_and_b32_e32 v8, 1, v8
-; GCN-NEXT:    v_and_b32_e32 v7, 1, v7
-; GCN-NEXT:    v_and_b32_e32 v6, 1, v6
-; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
-; GCN-NEXT:    v_and_b32_e32 v3, 1, v3
-; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v41, v42, v41, vcc
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:132
-; GCN-NEXT:    v_and_b32_e32 v43, 1, v43
-; GCN-NEXT:    v_mul_f32_e32 v40, 1.0, v40
+; GCN-NEXT:    v_and_b32_e32 v12, 1, v12
+; GCN-NEXT:    v_cndmask_b32_e32 v38, v38, v40, vcc
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:256
+; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
+; GCN-NEXT:    v_mul_f32_e32 v53, 1.0, v53
+; GCN-NEXT:    v_mul_f32_e32 v54, 1.0, v54
+; GCN-NEXT:    v_mul_f32_e32 v55, 1.0, v55
+; GCN-NEXT:    v_mul_f32_e32 v41, 1.0, v41
+; GCN-NEXT:    v_mul_f32_e32 v42, 1.0, v42
+; GCN-NEXT:    v_mul_f32_e32 v43, 1.0, v43
 ; GCN-NEXT:    v_mul_f32_e32 v44, 1.0, v44
 ; GCN-NEXT:    v_mul_f32_e32 v45, 1.0, v45
+; GCN-NEXT:    s_waitcnt vmcnt(14)
+; GCN-NEXT:    v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT:    v_mul_f32_e32 v48, 1.0, v48
 ; GCN-NEXT:    v_mul_f32_e32 v46, 1.0, v46
-; GCN-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NEXT:    s_waitcnt vmcnt(13)
+; GCN-NEXT:    v_mul_f32_e32 v47, 1.0, v47
+; GCN-NEXT:    s_waitcnt vmcnt(12)
+; GCN-NEXT:    v_mul_f32_e32 v56, 1.0, v56
+; GCN-NEXT:    s_waitcnt vmcnt(11)
+; GCN-NEXT:    v_mul_f32_e32 v57, 1.0, v57
+; GCN-NEXT:    s_waitcnt vmcnt(10)
+; GCN-NEXT:    v_mul_f32_e32 v58, 1.0, v58
+; GCN-NEXT:    s_waitcnt vmcnt(9)
+; GCN-NEXT:    v_mul_f32_e32 v59, 1.0, v59
+; GCN-NEXT:    s_waitcnt vmcnt(8)
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT:    s_waitcnt vmcnt(6)
+; GCN-NEXT:    v_mul_f32_e32 v34, 1.0, v34
+; GCN-NEXT:    s_waitcnt vmcnt(5)
 ; GCN-NEXT:    v_mul_f32_e32 v35, 1.0, v35
+; GCN-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NEXT:    v_mul_f32_e32 v49, 1.0, v49
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    v_mul_f32_e32 v39, 1.0, v39
-; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT:    v_mul_f32_e32 v51, 1.0, v51
-; GCN-NEXT:    v_mul_f32_e32 v53, 1.0, v53
-; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT:    v_mul_f32_e32 v50, 1.0, v50
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_mul_f32_e32 v49, 1.0, v49
+; GCN-NEXT:    v_mul_f32_e32 v51, 1.0, v51
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v59, 1.0, v59
-; GCN-NEXT:    v_mul_f32_e32 v54, 1.0, v54
-; GCN-NEXT:    v_mul_f32_e32 v47, 1.0, v47
-; GCN-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT:    v_mul_f32_e32 v34, 1.0, v34
-; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT:    v_mul_f32_e32 v56, 1.0, v56
-; GCN-NEXT:    v_mul_f32_e32 v57, 1.0, v57
 ; GCN-NEXT:    v_mul_f32_e32 v52, 1.0, v52
-; GCN-NEXT:    v_mul_f32_e32 v55, 1.0, v55
-; GCN-NEXT:    v_mul_f32_e32 v48, 1.0, v48
-; GCN-NEXT:    v_mul_f32_e32 v50, 1.0, v50
-; GCN-NEXT:    v_mul_f32_e32 v58, 1.0, v58
+; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT:    v_mul_f32_e32 v37, 1.0, v37
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v42, 1.0, v42
+; GCN-NEXT:    v_mul_f32_e32 v40, 1.0, v40
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
-; GCN-NEXT:    v_cndmask_b32_e32 v12, v53, v51, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v12, v31, v13, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v11
-; GCN-NEXT:    v_cndmask_b32_e32 v11, v31, v13, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v11, v52, v51, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
-; GCN-NEXT:    v_cndmask_b32_e32 v10, v59, v49, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v10, v50, v49, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v39, v35, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v9, v35, v34, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
-; GCN-NEXT:    v_cndmask_b32_e32 v8, v47, v54, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v8, v33, v32, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
-; GCN-NEXT:    v_cndmask_b32_e32 v7, v34, v33, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v7, v59, v58, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
-; GCN-NEXT:    v_cndmask_b32_e32 v6, v32, v28, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v6, v57, v56, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v46, v45, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v5, v47, v46, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v57, v56, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v48, v39, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v55, v52, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v45, v44, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v50, v48, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v43, v42, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v44, v40, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v41, v55, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v42, v58, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v43
-; GCN-NEXT:    v_cndmask_b32_e32 v31, v27, v26, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v54, v53, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v26
+; GCN-NEXT:    v_cndmask_b32_e32 v31, v40, v37, vcc
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
@@ -37888,7 +37905,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v41
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v38
 ; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
@@ -37901,25 +37918,26 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v38
-; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v37
-; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v36
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v36
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir b/llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir
new file mode 100644
index 00000000000000..f50688240fe8bd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir
@@ -0,0 +1,353 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+  define amdgpu_kernel void @spill_regpressure_less() #0 {
+    ret void
+  }
+
+  attributes #0 = { "amdgpu-waves-per-eu"="8,8" }
+...
+
+---
+name:            spill_regpressure_less
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy:       8
+body:             |
+  bb.0:
+    ; GCN-LABEL: name: spill_regpressure_less
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF32:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF33:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF34:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF35:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF36:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF37:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF38:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF39:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF40:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF41:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF42:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF43:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF44:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF45:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF46:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF47:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF48:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF49:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF50:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF51:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF52:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF53:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF54:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF55:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF56:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF57:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF58:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF59:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF60:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF61:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF62:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF63:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF64:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF65:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF66:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]], implicit [[DEF12]], implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]], implicit [[DEF18]], implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]], implicit [[DEF22]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]], implicit [[DEF28]], implicit [[DEF29]], implicit [[DEF30]], implicit [[DEF31]], implicit [[DEF32]], implicit [[DEF33]], implicit [[DEF34]], implicit [[DEF35]], implicit [[DEF36]], implicit [[DEF37]], implicit [[DEF38]], implicit [[DEF39]], implicit [[DEF40]], implicit [[DEF41]], implicit [[DEF42]], implicit [[DEF43]], implicit [[DEF44]], implicit [[DEF45]], implicit [[DEF46]], implicit [[DEF47]], implicit [[DEF48]], implicit [[DEF49]], implicit [[DEF50]], implicit [[DEF51]], implicit [[DEF52]], implicit [[DEF53]], implicit [[DEF54]], implicit [[DEF55]], implicit [[DEF56]], implicit [[DEF57]], implicit [[DEF58]], implicit [[DEF59]], implicit [[DEF60]], implicit [[DEF61]], implicit [[DEF62]], implicit [[DEF63]], implicit [[DEF64]], implicit [[DEF65]], implicit [[DEF66]]
+    ; GCN-NEXT: KILL [[DEF]]
+    ; GCN-NEXT: KILL [[DEF1]]
+    ; GCN-NEXT: KILL [[DEF10]]
+    ; GCN-NEXT: KILL [[DEF12]]
+    ; GCN-NEXT: KILL [[DEF13]]
+    ; GCN-NEXT: KILL [[DEF14]]
+    ; GCN-NEXT: KILL [[DEF15]]
+    ; GCN-NEXT: KILL [[DEF16]]
+    ; GCN-NEXT: [[DEF67:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: KILL [[DEF17]]
+    ; GCN-NEXT: [[DEF68:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF69:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF69]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]], implicit [[DEF28]]
+    ; GCN-NEXT: KILL [[DEF2]]
+    ; GCN-NEXT: KILL [[DEF3]]
+    ; GCN-NEXT: KILL [[DEF4]]
+    ; GCN-NEXT: KILL [[DEF5]]
+    ; GCN-NEXT: KILL [[DEF6]]
+    ; GCN-NEXT: KILL [[DEF7]]
+    ; GCN-NEXT: KILL [[DEF8]]
+    ; GCN-NEXT: KILL [[DEF9]]
+    ; GCN-NEXT: KILL [[DEF18]]
+    ; GCN-NEXT: KILL [[DEF19]]
+    ; GCN-NEXT: [[DEF70:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF70]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]]
+    ; GCN-NEXT: KILL [[DEF69]], implicit-def %70, implicit-def %71, implicit-def %72, implicit-def %73, implicit-def %74, implicit-def %75, implicit-def %76, implicit-def %77
+    ; GCN-NEXT: [[DEF71:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF72:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: KILL [[DEF20]]
+    ; GCN-NEXT: [[DEF73:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: KILL [[DEF11]]
+    ; GCN-NEXT: [[DEF74:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: KILL [[DEF21]]
+    ; GCN-NEXT: [[DEF75:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: KILL [[DEF22]]
+    ; GCN-NEXT: [[DEF76:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: KILL [[DEF23]]
+    ; GCN-NEXT: KILL [[DEF24]]
+    ; GCN-NEXT: KILL [[DEF25]]
+    ; GCN-NEXT: KILL [[DEF26]]
+    ; GCN-NEXT: KILL [[DEF27]]
+    ; GCN-NEXT: KILL [[DEF28]]
+    ; GCN-NEXT: KILL [[DEF29]]
+    ; GCN-NEXT: KILL [[DEF30]]
+    ; GCN-NEXT: KILL [[DEF31]]
+    ; GCN-NEXT: KILL [[DEF32]]
+    ; GCN-NEXT: KILL [[DEF33]]
+    ; GCN-NEXT: KILL [[DEF34]]
+    ; GCN-NEXT: KILL [[DEF35]]
+    ; GCN-NEXT: KILL [[DEF36]]
+    ; GCN-NEXT: KILL [[DEF37]]
+    ; GCN-NEXT: KILL [[DEF38]]
+    ; GCN-NEXT: KILL [[DEF39]]
+    ; GCN-NEXT: KILL [[DEF40]]
+    ; GCN-NEXT: KILL [[DEF41]]
+    ; GCN-NEXT: KILL [[DEF42]]
+    ; GCN-NEXT: KILL [[DEF43]]
+    ; GCN-NEXT: KILL [[DEF44]]
+    ; GCN-NEXT: KILL [[DEF45]]
+    ; GCN-NEXT: KILL [[DEF46]]
+    ; GCN-NEXT: KILL [[DEF47]]
+    ; GCN-NEXT: KILL [[DEF48]]
+    ; GCN-NEXT: KILL [[DEF49]]
+    ; GCN-NEXT: KILL [[DEF50]]
+    ; GCN-NEXT: KILL [[DEF51]]
+    ; GCN-NEXT: KILL [[DEF52]]
+    ; GCN-NEXT: KILL [[DEF53]]
+    ; GCN-NEXT: KILL [[DEF54]]
+    ; GCN-NEXT: KILL [[DEF55]]
+    ; GCN-NEXT: KILL [[DEF56]]
+    ; GCN-NEXT: KILL [[DEF57]]
+    ; GCN-NEXT: KILL [[DEF58]]
+    ; GCN-NEXT: KILL [[DEF59]]
+    ; GCN-NEXT: KILL [[DEF60]]
+    ; GCN-NEXT: KILL [[DEF61]]
+    ; GCN-NEXT: KILL [[DEF62]]
+    ; GCN-NEXT: KILL [[DEF63]]
+    ; GCN-NEXT: KILL [[DEF64]]
+    ; GCN-NEXT: KILL [[DEF65]]
+    ; GCN-NEXT: KILL [[DEF66]]
+    ; GCN-NEXT: KILL [[DEF67]]
+    ; GCN-NEXT: KILL [[DEF68]]
+    ; GCN-NEXT: KILL [[DEF71]]
+    ; GCN-NEXT: KILL [[DEF72]]
+    ; GCN-NEXT: KILL [[DEF73]]
+    ; GCN-NEXT: KILL [[DEF74]]
+    ; GCN-NEXT: KILL [[DEF75]]
+    ; GCN-NEXT: KILL [[DEF76]]
+    ; GCN-NEXT: KILL [[DEF70]]
+    ; GCN-NEXT: KILL %70
+    ; GCN-NEXT: KILL %71
+    ; GCN-NEXT: KILL %72
+    ; GCN-NEXT: KILL %73
+    ; GCN-NEXT: KILL %74
+    ; GCN-NEXT: KILL %75
+    ; GCN-NEXT: KILL %76
+    ; GCN-NEXT: KILL %77
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
+    %4:vgpr_32 = IMPLICIT_DEF
+    %5:vgpr_32 = IMPLICIT_DEF
+    %6:vgpr_32 = IMPLICIT_DEF
+    %7:vgpr_32 = IMPLICIT_DEF
+    %8:vgpr_32 = IMPLICIT_DEF
+    %9:vgpr_32 = IMPLICIT_DEF
+    %10:vgpr_32 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vgpr_32 = IMPLICIT_DEF
+    %13:vgpr_32 = IMPLICIT_DEF
+    %14:vgpr_32 = IMPLICIT_DEF
+    %15:vgpr_32 = IMPLICIT_DEF
+    %16:vgpr_32 = IMPLICIT_DEF
+    %17:vgpr_32 = IMPLICIT_DEF
+    %18:vgpr_32 = IMPLICIT_DEF
+    %19:vgpr_32 = IMPLICIT_DEF
+    %20:vgpr_32 = IMPLICIT_DEF
+    %21:vgpr_32 = IMPLICIT_DEF
+    %22:vgpr_32 = IMPLICIT_DEF
+    %23:vgpr_32 = IMPLICIT_DEF
+    %24:vgpr_32 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %26:vgpr_32 = IMPLICIT_DEF
+    %27:vgpr_32 = IMPLICIT_DEF
+    %28:vgpr_32 = IMPLICIT_DEF
+    %29:vgpr_32 = IMPLICIT_DEF
+    %30:vgpr_32 = IMPLICIT_DEF
+    %31:vgpr_32 = IMPLICIT_DEF
+    %32:vgpr_32 = IMPLICIT_DEF
+    %33:vgpr_32 = IMPLICIT_DEF
+    %34:vgpr_32 = IMPLICIT_DEF
+    %35:vgpr_32 = IMPLICIT_DEF
+    %36:vgpr_32 = IMPLICIT_DEF
+    %37:vgpr_32 = IMPLICIT_DEF
+    %38:vgpr_32 = IMPLICIT_DEF
+    %39:vgpr_32 = IMPLICIT_DEF
+    %40:vgpr_32 = IMPLICIT_DEF
+    %41:vgpr_32 = IMPLICIT_DEF
+    %42:vgpr_32 = IMPLICIT_DEF
+    %43:vgpr_32 = IMPLICIT_DEF
+    %44:vgpr_32 = IMPLICIT_DEF
+    %45:vgpr_32 = IMPLICIT_DEF
+    %46:vgpr_32 = IMPLICIT_DEF
+    %47:vgpr_32 = IMPLICIT_DEF
+    %48:vgpr_32 = IMPLICIT_DEF
+    %49:vgpr_32 = IMPLICIT_DEF
+    %50:vgpr_32 = IMPLICIT_DEF
+    %51:vgpr_32 = IMPLICIT_DEF
+    %52:vgpr_32 = IMPLICIT_DEF
+    %53:vgpr_32 = IMPLICIT_DEF
+    %54:vgpr_32 = IMPLICIT_DEF
+    %55:vgpr_32 = IMPLICIT_DEF
+    %56:vgpr_32 = IMPLICIT_DEF
+    %57:vgpr_32 = IMPLICIT_DEF
+    %58:vgpr_32 = IMPLICIT_DEF
+    %59:vgpr_32 = IMPLICIT_DEF
+    %60:vgpr_32 = IMPLICIT_DEF
+    %61:vgpr_32 = IMPLICIT_DEF
+    %62:vgpr_32 = IMPLICIT_DEF
+    %63:vgpr_32 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %65:vgpr_32 = IMPLICIT_DEF
+    %66:vgpr_32 = IMPLICIT_DEF
+    %67:vgpr_32 = IMPLICIT_DEF
+    %68:vgpr_32 = IMPLICIT_DEF
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31, implicit %32, implicit %33, implicit %34, implicit %35, implicit %36, implicit %37, implicit %38, implicit %39, implicit %40, implicit %41, implicit %42, implicit %43, implicit %44, implicit %45, implicit %46, implicit %47, implicit %48, implicit %49, implicit %50, implicit %51, implicit %52, implicit %53, implicit %54, implicit %55, implicit %56, implicit %57, implicit %58, implicit %59, implicit %60, implicit %61, implicit %62, implicit %63, implicit %64, implicit %65, implicit %66
+    %69:sgpr_128 = IMPLICIT_DEF
+    INLINEASM &"", 1, implicit %69, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28
+    KILL %0
+    KILL %1
+    KILL %2
+    KILL %3
+    KILL %4
+    KILL %5
+    KILL %6
+    KILL %7
+    KILL %8
+    KILL %9
+    KILL %10
+    KILL %12
+    KILL %13
+    KILL %14
+    KILL %15
+    KILL %16
+    KILL %17
+    KILL %18
+    KILL %19
+    KILL %69:sgpr_128, implicit-def %77:vgpr_32, implicit-def %78:vgpr_32, implicit-def %79:vgpr_32, implicit-def %80:vgpr_32, implicit-def %81:vgpr_32, implicit-def %82:vgpr_32, implicit-def %83:vgpr_32, implicit-def %84:vgpr_32
+    %70:vgpr_32 = IMPLICIT_DEF
+    %71:vgpr_32 = IMPLICIT_DEF
+    %72:vgpr_32 = IMPLICIT_DEF
+    %73:vgpr_32 = IMPLICIT_DEF
+    %74:vgpr_32 = IMPLICIT_DEF
+    %75:vgpr_32 = IMPLICIT_DEF
+    %76:sgpr_128 = IMPLICIT_DEF
+    INLINEASM &"", 1, implicit %76, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
+    KILL %20
+    KILL %11
+    KILL %21
+    KILL %22
+    KILL %23
+    KILL %24
+    KILL %25
+    KILL %26
+    KILL %27
+    KILL %28
+    KILL %29
+    KILL %30
+    KILL %31
+    KILL %32
+    KILL %33
+    KILL %34
+    KILL %35
+    KILL %36
+    KILL %37
+    KILL %38
+    KILL %39
+    KILL %40
+    KILL %41
+    KILL %42
+    KILL %43
+    KILL %44
+    KILL %45
+    KILL %46
+    KILL %47
+    KILL %48
+    KILL %49
+    KILL %50
+    KILL %51
+    KILL %52
+    KILL %53
+    KILL %54
+    KILL %55
+    KILL %56
+    KILL %57
+    KILL %58
+    KILL %59
+    KILL %60
+    KILL %61
+    KILL %62
+    KILL %63
+    KILL %64
+    KILL %65
+    KILL %66
+    KILL %67
+    KILL %68
+    KILL %70
+    KILL %71
+    KILL %72
+    KILL %73
+    KILL %74
+    KILL %75
+    KILL %76
+    KILL %77
+    KILL %78
+    KILL %79
+    KILL %80
+    KILL %81
+    KILL %82
+    KILL %83
+    KILL %84
+...
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# GCN: {{.*}}


        


More information about the llvm-commits mailing list