[llvm] [AMDGPU][MISCHED] GCNBalancedSchedStrategy. (PR #68229)

via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 4 08:40:37 PDT 2023


https://github.com/alex-t created https://github.com/llvm/llvm-project/pull/68229

The change implements the scheduling strategy which aims to find a reasonable trade-off between the ILP and occupancy. For that purpose, it computes the heuristic metric to decide if the current schedule is worth to be kept. This is an attempt to use the same idea as in the https://reviews.llvm.org/D139710 to replace the shouldRevertScheduling function. Unlike the https://reviews.llvm.org/D139710 the heuristic is applied to all scheduling stages.

>From f4463b3a1a9cdf063ae44d232b65e02e548a00f9 Mon Sep 17 00:00:00 2001
From: Alexander Timofeev <alexander.timofeev at amd.com>
Date: Mon, 2 Oct 2023 18:15:50 +0200
Subject: [PATCH] [AMDGPU][MISCHED] GCNBalancedSchedStrategy.

The change implements the scheduling strategy which aims to find a reasonable trade-off between the ILP and occupancy.
For that purpose, it computes the heuristic metric to decide if the current schedule is worth to be kept.
This is an attempt to use the same idea as in the https://reviews.llvm.org/D139710 to replace the shouldRevertScheduling function.
Unlike the https://reviews.llvm.org/D139710 the heuristic is applied to all scheduling stages.
---
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   23 +
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  242 +---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h     |  139 +-
 .../GlobalISel/combine-fma-add-fma-mul.ll     |  104 +-
 .../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll |   44 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              |   64 +-
 .../CodeGen/AMDGPU/debug-value-scheduler.mir  |    2 -
 llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll |  231 ++--
 llvm/test/CodeGen/AMDGPU/function-args.ll     |  895 ++++++-------
 llvm/test/CodeGen/AMDGPU/function-returns.ll  |   18 +-
 .../AMDGPU/gfx-callable-return-types.ll       |  298 ++---
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    | 1188 ++++++++---------
 llvm/test/CodeGen/AMDGPU/load-constant-i32.ll |  180 ++-
 .../machine-scheduler-sink-trivial-remats.mir |   48 -
 14 files changed, 1701 insertions(+), 1775 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index bcbc03eb2559c4f..78bb5790a727b39 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -348,6 +348,12 @@ static cl::opt<bool> EnableRewritePartialRegUses(
     cl::desc("Enable rewrite partial reg uses pass"), cl::init(false),
     cl::Hidden);
 
+static cl::opt<bool> EnableBalancedSchedStrategy(
+    "amdgpu-enable-balanced-scheduling-strategy",
+    cl::desc(
+        "Enable scheduling strategy to tradeoff between ILP and occupancy."),
+    cl::Hidden, cl::init(true));
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -454,6 +460,20 @@ createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
   return DAG;
 }
 
+static ScheduleDAGInstrs *
+createGCNBalancedMachineScheduler(MachineSchedContext *C) {
+  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
+  ScheduleDAGMILive *DAG =
+    new GCNScheduleDAGMILive(C, std::make_unique<GCNBalancedSchedStrategy>(C));
+  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.shouldClusterStores())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createIGroupLPDAGMutation());
+  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
+  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+  return DAG;
+}
+
 static ScheduleDAGInstrs *
 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
@@ -1139,6 +1159,9 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
   if (EnableMaxIlpSchedStrategy)
     return createGCNMaxILPMachineScheduler(C);
 
+  if (EnableBalancedSchedStrategy)
+    return createGCNBalancedMachineScheduler(C);
+
   return createGCNMaxOccupancyMachineScheduler(C);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index ce481e1f1a8bc48..90d7094c4510418 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -51,8 +51,6 @@ static cl::opt<bool>
                         "Wave Limited (amdgpu-limit-wave-threshold)."),
                cl::init(false));
 
-const unsigned ScheduleMetrics::ScaleFactor = 100;
-
 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
       HasHighPressure(false) {}
@@ -702,7 +700,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
   if (!GCNSchedStage::initGCNSchedStage())
     return false;
 
-  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
+  if (DAG.RegionsWithExcessRP.none())
     return false;
 
   SavedMutations.swap(DAG.Mutations);
@@ -838,6 +836,7 @@ bool GCNSchedStage::initGCNRegion() {
 
   S.HasHighPressure = false;
   S.KnownExcessRP = isRegionWithExcessRP();
+  S.clearMetric();
 
   if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
       StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
@@ -969,9 +968,7 @@ void GCNSchedStage::checkScheduling() {
     DAG.RegionsWithExcessRP[RegionIdx] = true;
   }
 
-  // Revert if this region's schedule would cause a drop in occupancy or
-  // spilling.
-  if (shouldRevertScheduling(WavesAfter)) {
+  if (shouldRevertScheduling(WavesAfter, WavesBefore)) {
     revertScheduling();
   } else {
     DAG.Pressure[RegionIdx] = PressureAfter;
@@ -980,193 +977,52 @@ void GCNSchedStage::checkScheduling() {
   }
 }
 
-unsigned
-GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
-                                      DenseMap<unsigned, unsigned> &ReadyCycles,
-                                      const TargetSchedModel &SM) {
-  unsigned ReadyCycle = CurrCycle;
-  for (auto &D : SU.Preds) {
-    if (D.isAssignedRegDep()) {
-      MachineInstr *DefMI = D.getSUnit()->getInstr();
-      unsigned Latency = SM.computeInstrLatency(DefMI);
-      unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
-      ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
-    }
-  }
-  ReadyCycles[SU.NodeNum] = ReadyCycle;
-  return ReadyCycle;
-}
-
-#ifndef NDEBUG
-struct EarlierIssuingCycle {
-  bool operator()(std::pair<MachineInstr *, unsigned> A,
-                  std::pair<MachineInstr *, unsigned> B) const {
-    return A.second < B.second;
-  }
-};
-
-static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
-                                        EarlierIssuingCycle> &ReadyCycles) {
-  if (ReadyCycles.empty())
-    return;
-  unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
-  dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
-         << " ##################\n# Cycle #\t\t\tInstruction          "
-            "             "
-            "                            \n";
-  unsigned IPrev = 1;
-  for (auto &I : ReadyCycles) {
-    if (I.second > IPrev + 1)
-      dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
-             << " CYCLES DETECTED ******************************\n\n";
-    dbgs() << "[ " << I.second << " ]  :  " << *I.first << "\n";
-    IPrev = I.second;
-  }
-}
-#endif
-
-ScheduleMetrics
-GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
-#ifndef NDEBUG
-  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
-      ReadyCyclesSorted;
-#endif
-  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
-  unsigned SumBubbles = 0;
-  DenseMap<unsigned, unsigned> ReadyCycles;
-  unsigned CurrCycle = 0;
-  for (auto &SU : InputSchedule) {
-    unsigned ReadyCycle =
-        computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
-    SumBubbles += ReadyCycle - CurrCycle;
-#ifndef NDEBUG
-    ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
-#endif
-    CurrCycle = ++ReadyCycle;
-  }
-#ifndef NDEBUG
-  LLVM_DEBUG(
-      printScheduleModel(ReadyCyclesSorted);
-      dbgs() << "\n\t"
-             << "Metric: "
-             << (SumBubbles
-                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
-                     : 1)
-             << "\n\n");
-#endif
-
-  return ScheduleMetrics(CurrCycle, SumBubbles);
-}
-
-ScheduleMetrics
-GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
-#ifndef NDEBUG
-  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
-      ReadyCyclesSorted;
-#endif
-  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
-  unsigned SumBubbles = 0;
-  DenseMap<unsigned, unsigned> ReadyCycles;
-  unsigned CurrCycle = 0;
-  for (auto &MI : DAG) {
-    SUnit *SU = DAG.getSUnit(&MI);
-    if (!SU)
-      continue;
-    unsigned ReadyCycle =
-        computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
-    SumBubbles += ReadyCycle - CurrCycle;
-#ifndef NDEBUG
-    ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
-#endif
-    CurrCycle = ++ReadyCycle;
-  }
-#ifndef NDEBUG
-  LLVM_DEBUG(
-      printScheduleModel(ReadyCyclesSorted);
-      dbgs() << "\n\t"
-             << "Metric: "
-             << (SumBubbles
-                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
-                     : 1)
-             << "\n\n");
-#endif
-
-  return ScheduleMetrics(CurrCycle, SumBubbles);
-}
-
-bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
+bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter,
+                                           unsigned WavesBefore) {
   if (WavesAfter < DAG.MinOccupancy)
     return true;
 
   return false;
 }
 
-bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter,
+                                                     unsigned WavesBefore) {
   if (PressureAfter == PressureBefore)
     return false;
 
-  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
-    return true;
-
   if (mayCauseSpilling(WavesAfter))
     return true;
 
-  return false;
+  return S.computeScheduleMetric(RegionIdx, WavesAfter, WavesBefore);
 }
 
-bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
-  // If RP is not reduced in the unclustered reschedule stage, revert to the
-  // old schedule.
-  if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
-       mayCauseSpilling(WavesAfter)) ||
-      GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
-    LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
-    return true;
-  }
-
-  // Do not attempt to relax schedule even more if we are already spilling.
-  if (isRegionWithExcessRP())
-    return false;
-
-  LLVM_DEBUG(
-      dbgs()
-      << "\n\t      *** In shouldRevertScheduling ***\n"
-      << "      *********** BEFORE UnclusteredHighRPStage ***********\n");
-  ScheduleMetrics MBefore =
-      getScheduleMetrics(DAG.SUnits);
-  LLVM_DEBUG(
-      dbgs()
-      << "\n      *********** AFTER UnclusteredHighRPStage ***********\n");
-  ScheduleMetrics MAfter = getScheduleMetrics(DAG);
-  unsigned OldMetric = MBefore.getMetric();
-  unsigned NewMetric = MAfter.getMetric();
-  unsigned WavesBefore =
-      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
-  unsigned Profit =
-      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
-       ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
-       NewMetric) /
-      ScheduleMetrics::ScaleFactor;
-  LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
-                    << MAfter << "Profit: " << Profit << "\n");
-  return Profit < ScheduleMetrics::ScaleFactor;
+bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter,
+                                                    unsigned WavesBefore) {
+  // Revert if may cause spilling. Otherwise rely on the metric computed by the
+  // strategy class. Exception: does not make sense to revert the unclustered
+  // schedule if we are still in excess RP state as it will not become better.
+  return GCNSchedStage::shouldRevertScheduling(WavesAfter, WavesBefore) ||
+         (S.computeScheduleMetric(RegionIdx, WavesAfter, WavesBefore) &&
+          !isRegionWithExcessRP());
 }
 
-bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
+bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter,
+                                                  unsigned WavesBefore) {
   if (PressureAfter == PressureBefore)
     return false;
 
-  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
+  if (GCNSchedStage::shouldRevertScheduling(WavesAfter, WavesBefore))
     return true;
 
   if (mayCauseSpilling(WavesAfter))
     return true;
 
-  return false;
+  return S.computeScheduleMetric(RegionIdx, WavesAfter, WavesBefore);
 }
 
-bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
-  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
+bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter,
+                                             unsigned WavesBefore) {
+  if (GCNSchedStage::shouldRevertScheduling(WavesAfter, WavesBefore))
     return true;
 
   if (mayCauseSpilling(WavesAfter))
@@ -1175,7 +1031,8 @@ bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
   return false;
 }
 
-bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter,
+                                                     unsigned WavesBefore) {
   if (mayCauseSpilling(WavesAfter))
     return true;
 
@@ -1569,3 +1426,52 @@ void GCNPostScheduleDAGMILive::finalizeSchedule() {
 
   ScheduleDAGMI::finalizeSchedule();
 }
+
+unsigned GCNBalancedSchedStrategy::computeSUnitReadyCycle(const SUnit &SU) {
+  unsigned ReadyCycle = CurrCycle;
+  for (auto &D : SU.Preds) {
+    if (D.isAssignedRegDep()) {
+      MachineInstr *DefMI = D.getSUnit()->getInstr();
+      unsigned Latency = SM->computeInstrLatency(DefMI);
+      unsigned DefReady = ReadyCycles[DAG->getSUnit(DefMI)->NodeNum];
+      ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
+    }
+  }
+  ReadyCycles[SU.NodeNum] = ReadyCycle;
+  return ReadyCycle;
+}
+
+bool llvm::GCNBalancedSchedStrategy::computeScheduleMetric(
+    unsigned RegionIdx, unsigned WavesAfter, unsigned WavesBefore) {
+  bool Result = false;
+  unsigned PrevMetric = 0;
+  if (Metrics.count(RegionIdx)) {
+    PrevMetric = Metrics[RegionIdx].back();
+  }
+  for (auto &MI : *DAG) {
+    SUnit *SU = DAG->getSUnit(&MI);
+    if (!SU)
+      continue;
+    unsigned ReadyCycle = computeSUnitReadyCycle(*SU);
+    StallTotal += ReadyCycle - CurrCycle;
+#ifndef NDEBUG
+    PrintableSchedule.insert(std::make_pair(SU->getInstr(), ReadyCycle));
+#endif
+    CurrCycle = ++ReadyCycle;
+  }
+  unsigned Metric = StallTotal ? StallTotal * ScaleFactor / CurrCycle : 1;
+#ifndef NDEBUG
+  LLVM_DEBUG(printSchedule());
+#endif
+  if (PrevMetric) {
+    unsigned Profit =
+        ((WavesAfter * ScaleFactor) / WavesBefore *
+         ((PrevMetric + ScheduleMetricBias) * ScaleFactor) / Metric) /
+        ScaleFactor;
+    Result = Profit < ScaleFactor;
+  }
+  if (!Result)
+    Metrics[RegionIdx].push_back(Metric);
+  clearMetric();
+  return Result;
+}
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 7862ec1e894b62e..84dbf0ce3e7913c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -116,6 +116,12 @@ class GCNSchedStrategy : public GenericScheduler {
   bool hasNextStage() const;
 
   GCNSchedStageID getNextStage() const;
+
+  virtual bool computeScheduleMetric(unsigned RegionIdx, unsigned WavesAfter,
+                                    unsigned WavesBefore) {
+    return false;
+  }
+  virtual void clearMetric(){};
 };
 
 /// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
@@ -136,33 +142,6 @@ class GCNMaxILPSchedStrategy final : public GCNSchedStrategy {
   GCNMaxILPSchedStrategy(const MachineSchedContext *C);
 };
 
-class ScheduleMetrics {
-  unsigned ScheduleLength;
-  unsigned BubbleCycles;
-
-public:
-  ScheduleMetrics() {}
-  ScheduleMetrics(unsigned L, unsigned BC)
-      : ScheduleLength(L), BubbleCycles(BC) {}
-  unsigned getLength() const { return ScheduleLength; }
-  unsigned getBubbles() const { return BubbleCycles; }
-  unsigned getMetric() const {
-    unsigned Metric = (BubbleCycles * ScaleFactor) / ScheduleLength;
-    // Metric is zero if the amount of bubbles is less than 1% which is too
-    // small. So, return 1.
-    return Metric ? Metric : 1;
-  }
-  static const unsigned ScaleFactor;
-};
-
-inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) {
-  dbgs() << "\n Schedule Metric (scaled by "
-         << ScheduleMetrics::ScaleFactor
-         << " ) is: " << Sm.getMetric() << " [ " << Sm.getBubbles() << "/"
-         << Sm.getLength() << " ]\n";
-  return OS;
-}
-
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
   friend class OccInitialScheduleStage;
@@ -296,15 +275,9 @@ class GCNSchedStage {
   // Check result of scheduling.
   void checkScheduling();
 
-  // computes the given schedule virtual execution time in clocks
-  ScheduleMetrics getScheduleMetrics(const std::vector<SUnit> &InputSchedule);
-  ScheduleMetrics getScheduleMetrics(const GCNScheduleDAGMILive &DAG);
-  unsigned computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
-                                  DenseMap<unsigned, unsigned> &ReadyCycles,
-                                  const TargetSchedModel &SM);
-
   // Returns true if scheduling should be reverted.
-  virtual bool shouldRevertScheduling(unsigned WavesAfter);
+  virtual bool shouldRevertScheduling(unsigned WavesAfter,
+                                      unsigned WavesBefore);
 
   // Returns true if current region has known excess pressure.
   bool isRegionWithExcessRP() const {
@@ -324,7 +297,8 @@ class GCNSchedStage {
 
 class OccInitialScheduleStage : public GCNSchedStage {
 public:
-  bool shouldRevertScheduling(unsigned WavesAfter) override;
+  bool shouldRevertScheduling(unsigned WavesAfter,
+                              unsigned WavesBefore) override;
 
   OccInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
       : GCNSchedStage(StageID, DAG) {}
@@ -342,7 +316,8 @@ class UnclusteredHighRPStage : public GCNSchedStage {
 
   bool initGCNRegion() override;
 
-  bool shouldRevertScheduling(unsigned WavesAfter) override;
+  bool shouldRevertScheduling(unsigned WavesAfter,
+                              unsigned WavesBefore) override;
 
   UnclusteredHighRPStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
       : GCNSchedStage(StageID, DAG) {}
@@ -357,7 +332,8 @@ class ClusteredLowOccStage : public GCNSchedStage {
 
   bool initGCNRegion() override;
 
-  bool shouldRevertScheduling(unsigned WavesAfter) override;
+  bool shouldRevertScheduling(unsigned WavesAfter,
+                              unsigned WavesBefore) override;
 
   ClusteredLowOccStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
       : GCNSchedStage(StageID, DAG) {}
@@ -393,7 +369,8 @@ class PreRARematStage : public GCNSchedStage {
 
   bool initGCNRegion() override;
 
-  bool shouldRevertScheduling(unsigned WavesAfter) override;
+  bool shouldRevertScheduling(unsigned WavesAfter,
+                              unsigned WavesBefore = 0) override;
 
   PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
       : GCNSchedStage(StageID, DAG) {}
@@ -401,7 +378,8 @@ class PreRARematStage : public GCNSchedStage {
 
 class ILPInitialScheduleStage : public GCNSchedStage {
 public:
-  bool shouldRevertScheduling(unsigned WavesAfter) override;
+  bool shouldRevertScheduling(unsigned WavesAfter,
+                              unsigned WavesBefore) override;
 
   ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
       : GCNSchedStage(StageID, DAG) {}
@@ -423,6 +401,87 @@ class GCNPostScheduleDAGMILive final : public ScheduleDAGMI {
                            bool RemoveKillFlags);
 };
 
+#ifndef NDEBUG
+struct EarlierIssuingCycle {
+  bool operator()(std::pair<MachineInstr *, unsigned> A,
+                  std::pair<MachineInstr *, unsigned> B) const {
+    return A.second < B.second;
+  }
+};
+#endif
+
+/// The goal of this scheduling strategy is to find a reasonable tradeof between
+/// the kernel occupancy (i.e. maximum number of waves per simd). and ILP (i.e.
+/// minimize the amount of stall cycles by means of the better latency
+/// covering).
+class GCNBalancedSchedStrategy final : public GCNSchedStrategy {
+
+  const unsigned ScaleFactor = 100;
+  unsigned StallTotal = 0;
+  unsigned CurrCycle = 0;
+  DenseMap<unsigned, unsigned> ReadyCycles;
+  DenseMap<unsigned, SmallVector<unsigned, 4>> Metrics;
+  const TargetSchedModel *SM;
+  unsigned computeSUnitReadyCycle(const SUnit &SU);
+
+  void clearMetric() override {
+    StallTotal = 0;
+    CurrCycle = 0;
+    ReadyCycles.clear();
+#ifndef NDEBUG
+    PrintableSchedule.clear();
+#endif
+  }
+
+#ifndef NDEBUG
+  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> PrintableSchedule;
+
+  void printSchedule() {
+    if (PrintableSchedule.empty())
+      return;
+
+    unsigned BBNum = PrintableSchedule.begin()->first->getParent()->getNumber();
+    dbgs() << "\n################## Schedule time ReadyCycles for MBB : "
+           << BBNum
+           << " ##################\n# Cycle #\t\t\tInstruction          "
+              "             "
+              "                            \n";
+    unsigned IPrev = 1;
+    for (auto &I : PrintableSchedule) {
+      if (I.second > IPrev + 1)
+        dbgs() << "****************************** BUBBLE OF "
+               << I.second - IPrev
+               << " CYCLES DETECTED ******************************\n\n";
+      dbgs() << "[ " << I.second << " ]  :  " << *I.first << "\n";
+      IPrev = I.second;
+    }
+    dbgs() << "\n\t"
+             << "Metric: "
+             << (StallTotal
+                     ? (StallTotal * ScaleFactor) / CurrCycle
+                     : 1)
+             << "\n\n";
+  }
+#endif
+
+public:
+  GCNBalancedSchedStrategy(const MachineSchedContext *C) : GCNSchedStrategy(C) {
+    SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
+    SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
+    SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
+    SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
+  }
+
+  void initialize(ScheduleDAGMI *DAG) override {
+    GCNSchedStrategy::initialize(DAG);
+    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+    SM = &ST.getInstrInfo()->getSchedModel();
+  }
+
+  bool computeScheduleMetric(unsigned RegionIdx, unsigned WavesAfter,
+                            unsigned WavesBefore) override;
+};
+
 } // End namespace llvm
 
 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
index 3c9fae3efc30b1b..34c330569eba034 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
@@ -699,24 +699,24 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; GFX9-CONTRACT-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:28
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:12
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:16
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:28
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:32
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
+; GFX9-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -725,24 +725,24 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; GFX9-DENORM-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GFX9-DENORM-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
+; GFX9-DENORM-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; GFX9-DENORM-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
+; GFX9-DENORM-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:28
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
-; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:12
-; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:16
+; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-DENORM-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
-; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
-; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
-; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:28
-; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:32
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
+; GFX9-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -870,24 +870,24 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; GFX9-CONTRACT-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:28
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:12
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:16
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-CONTRACT-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:28
-; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:32
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
+; GFX9-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -896,24 +896,24 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; GFX9-DENORM-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GFX9-DENORM-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
+; GFX9-DENORM-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; GFX9-DENORM-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
+; GFX9-DENORM-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:28
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
-; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:12
-; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:16
+; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-DENORM-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
-; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
-; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
-; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:28
-; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:32
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
+; GFX9-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index b97eba8e70b4988..09482fd80ab435b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -1348,29 +1348,29 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v16
 ; GFX6-NEXT:    v_min_u32_e32 v16, v3, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v16
-; GFX6-NEXT:    v_min_u32_e32 v16, v4, v20
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v16
-; GFX6-NEXT:    v_min_u32_e32 v16, v5, v21
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v16
-; GFX6-NEXT:    v_min_u32_e32 v16, v6, v22
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v16
-; GFX6-NEXT:    v_min_u32_e32 v16, v7, v23
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v16
-; GFX6-NEXT:    v_min_u32_e32 v16, v8, v24
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v16
-; GFX6-NEXT:    v_min_u32_e32 v16, v9, v25
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v16
-; GFX6-NEXT:    v_min_u32_e32 v16, v10, v26
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v16
 ; GFX6-NEXT:    buffer_load_dword v16, off, s[0:3], s32
-; GFX6-NEXT:    v_min_u32_e32 v17, v11, v27
-; GFX6-NEXT:    v_min_u32_e32 v18, v12, v28
-; GFX6-NEXT:    v_min_u32_e32 v19, v13, v29
-; GFX6-NEXT:    v_min_u32_e32 v20, v14, v30
-; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, v11, v17
-; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v18
-; GFX6-NEXT:    v_sub_i32_e32 v13, vcc, v13, v19
-; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v20
+; GFX6-NEXT:    v_min_u32_e32 v17, v4, v20
+; GFX6-NEXT:    v_min_u32_e32 v18, v5, v21
+; GFX6-NEXT:    v_min_u32_e32 v19, v6, v22
+; GFX6-NEXT:    v_min_u32_e32 v20, v7, v23
+; GFX6-NEXT:    v_min_u32_e32 v21, v8, v24
+; GFX6-NEXT:    v_min_u32_e32 v22, v9, v25
+; GFX6-NEXT:    v_min_u32_e32 v23, v10, v26
+; GFX6-NEXT:    v_min_u32_e32 v24, v11, v27
+; GFX6-NEXT:    v_min_u32_e32 v25, v12, v28
+; GFX6-NEXT:    v_min_u32_e32 v26, v13, v29
+; GFX6-NEXT:    v_min_u32_e32 v27, v14, v30
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v17
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v18
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v19
+; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v20
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v21
+; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v22
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v23
+; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, v11, v24
+; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v25
+; GFX6-NEXT:    v_sub_i32_e32 v13, vcc, v13, v26
+; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v27
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_min_u32_e32 v16, v15, v16
 ; GFX6-NEXT:    v_sub_i32_e32 v15, vcc, v15, v16
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 0a81e98005b1adf..e8f5b88db657863 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2800,6 +2800,7 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x6c, v0
 ; GCN-NEXT:    buffer_store_dword v30, v2, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x64, v0
 ; GCN-NEXT:    buffer_store_dword v29, v31, s[0:3], 0 offen
@@ -2814,7 +2815,6 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0x50, v0
 ; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x4c, v0
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    buffer_store_dword v26, v29, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0x48, v0
@@ -2897,21 +2897,21 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
 ; GFX7-NEXT:    buffer_store_dword v27, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
+; GFX7-NEXT:    v_add_i32_e32 v27, vcc, 0x5c, v0
 ; GFX7-NEXT:    buffer_store_dword v26, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
-; GFX7-NEXT:    buffer_store_dword v25, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
+; GFX7-NEXT:    v_add_i32_e32 v26, vcc, 0x54, v0
+; GFX7-NEXT:    buffer_store_dword v25, v27, s[0:3], 0 offen
 ; GFX7-NEXT:    buffer_store_dword v24, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
-; GFX7-NEXT:    buffer_store_dword v23, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
-; GFX7-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
-; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
+; GFX7-NEXT:    v_add_i32_e32 v25, vcc, 0x50, v0
+; GFX7-NEXT:    v_add_i32_e32 v27, vcc, 0x4c, v0
+; GFX7-NEXT:    v_add_i32_e32 v24, vcc, 0x44, v0
+; GFX7-NEXT:    buffer_store_dword v23, v26, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v22, v25, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v21, v27, s[0:3], 0 offen
 ; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
-; GFX7-NEXT:    buffer_store_dword v19, v2, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v19, v24, s[0:3], 0 offen
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
 ; GFX7-NEXT:    buffer_store_dword v18, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
@@ -2975,21 +2975,21 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x64, v0
 ; GFX8-NEXT:    buffer_store_dword v27, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x60, v0
+; GFX8-NEXT:    v_add_u32_e32 v27, vcc, 0x5c, v0
 ; GFX8-NEXT:    buffer_store_dword v26, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x5c, v0
-; GFX8-NEXT:    buffer_store_dword v25, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x58, v0
+; GFX8-NEXT:    v_add_u32_e32 v26, vcc, 0x54, v0
+; GFX8-NEXT:    buffer_store_dword v25, v27, s[0:3], 0 offen
 ; GFX8-NEXT:    buffer_store_dword v24, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x54, v0
-; GFX8-NEXT:    buffer_store_dword v23, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x50, v0
-; GFX8-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x4c, v0
-; GFX8-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x48, v0
+; GFX8-NEXT:    v_add_u32_e32 v25, vcc, 0x50, v0
+; GFX8-NEXT:    v_add_u32_e32 v27, vcc, 0x4c, v0
+; GFX8-NEXT:    v_add_u32_e32 v24, vcc, 0x44, v0
+; GFX8-NEXT:    buffer_store_dword v23, v26, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v22, v25, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v21, v27, s[0:3], 0 offen
 ; GFX8-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x44, v0
-; GFX8-NEXT:    buffer_store_dword v19, v2, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v19, v24, s[0:3], 0 offen
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 64, v0
 ; GFX8-NEXT:    buffer_store_dword v18, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 60, v0
@@ -3035,20 +3035,20 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GFX9-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
 ; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
 ; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s32
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
 ; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
 ; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
 ; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
 ; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
 ; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
 ; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
@@ -3065,11 +3065,11 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
 ; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(18)
-; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT:    s_waitcnt vmcnt(18)
-; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT:    s_waitcnt vmcnt(25)
+; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT:    s_waitcnt vmcnt(25)
+; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:116
 ; GFX9-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir
index 24e80c9c177c3ed..87007bdf42d7d21 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir
+++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir
@@ -7,13 +7,11 @@
 # CHECK-NEXT:   From: DBG_VALUE %17:vgpr_32, 0, 0
 # CHECK-NEXT:     To: S_ENDPGM 0, implicit %69:vgpr_32, implicit %70:vgpr_32
 # CHECK-NEXT:  RegionInstrs: 46
-# CHECK: Attempting to revert scheduling.
 # CHECK: ********** MI Scheduling **********
 # CHECK: test_same_num_instrs:%bb.2
 # CHECK-NEXT:   From: DBG_VALUE %17:vgpr_32, 0, 0
 # CHECK-NEXT:     To: S_ENDPGM 0, implicit %69:vgpr_32, implicit %70:vgpr_32
 # CHECK-NEXT:  RegionInstrs: 46
-# CHECK: Attempting to revert scheduling.
 
 ---
 name:            test_same_num_instrs
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 27c42a9ea0db60a..ef6b9210a4ff2aa 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -3141,49 +3141,49 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
 ; VI-LABEL: v_test_canonicalize_var_v32f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_f16_sdwa v20, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_f16_sdwa v27, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
-; VI-NEXT:    v_or_b32_e32 v0, v0, v20
-; VI-NEXT:    v_max_f16_sdwa v20, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v0, v0, v27
+; VI-NEXT:    v_max_f16_sdwa v27, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_e32 v1, v1, v1
-; VI-NEXT:    v_or_b32_e32 v1, v1, v20
-; VI-NEXT:    v_max_f16_sdwa v20, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v1, v1, v27
+; VI-NEXT:    v_max_f16_sdwa v27, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_e32 v2, v2, v2
-; VI-NEXT:    v_or_b32_e32 v2, v2, v20
-; VI-NEXT:    v_max_f16_sdwa v20, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v2, v2, v27
+; VI-NEXT:    v_max_f16_sdwa v27, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_e32 v3, v3, v3
-; VI-NEXT:    v_or_b32_e32 v3, v3, v20
-; VI-NEXT:    v_max_f16_sdwa v20, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_max_f16_e32 v4, v4, v4
-; VI-NEXT:    v_or_b32_e32 v4, v4, v20
-; VI-NEXT:    v_max_f16_sdwa v20, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_max_f16_e32 v5, v5, v5
-; VI-NEXT:    v_or_b32_e32 v5, v5, v20
-; VI-NEXT:    v_max_f16_sdwa v20, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_max_f16_e32 v6, v6, v6
-; VI-NEXT:    v_or_b32_e32 v6, v6, v20
-; VI-NEXT:    v_max_f16_sdwa v20, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_max_f16_e32 v7, v7, v7
-; VI-NEXT:    v_or_b32_e32 v7, v7, v20
-; VI-NEXT:    v_max_f16_sdwa v20, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_max_f16_e32 v8, v8, v8
-; VI-NEXT:    v_or_b32_e32 v8, v8, v20
-; VI-NEXT:    v_max_f16_sdwa v20, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_max_f16_e32 v9, v9, v9
-; VI-NEXT:    v_or_b32_e32 v9, v9, v20
-; VI-NEXT:    v_max_f16_sdwa v20, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_max_f16_e32 v10, v10, v10
 ; VI-NEXT:    v_max_f16_sdwa v16, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_sdwa v17, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_sdwa v18, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_sdwa v19, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_e32 v10, v10, v20
 ; VI-NEXT:    v_max_f16_sdwa v20, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_f16_sdwa v21, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_f16_sdwa v22, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_f16_sdwa v23, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_f16_sdwa v24, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_f16_sdwa v25, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_f16_sdwa v26, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v3, v3, v27
+; VI-NEXT:    v_max_f16_sdwa v27, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_e32 v15, v15, v15
 ; VI-NEXT:    v_max_f16_e32 v14, v14, v14
 ; VI-NEXT:    v_max_f16_e32 v13, v13, v13
 ; VI-NEXT:    v_max_f16_e32 v12, v12, v12
 ; VI-NEXT:    v_max_f16_e32 v11, v11, v11
+; VI-NEXT:    v_max_f16_e32 v10, v10, v10
+; VI-NEXT:    v_max_f16_e32 v9, v9, v9
+; VI-NEXT:    v_max_f16_e32 v8, v8, v8
+; VI-NEXT:    v_max_f16_e32 v7, v7, v7
+; VI-NEXT:    v_max_f16_e32 v6, v6, v6
+; VI-NEXT:    v_max_f16_e32 v5, v5, v5
+; VI-NEXT:    v_max_f16_e32 v4, v4, v4
+; VI-NEXT:    v_or_b32_e32 v4, v4, v27
+; VI-NEXT:    v_or_b32_e32 v5, v5, v26
+; VI-NEXT:    v_or_b32_e32 v6, v6, v25
+; VI-NEXT:    v_or_b32_e32 v7, v7, v24
+; VI-NEXT:    v_or_b32_e32 v8, v8, v23
+; VI-NEXT:    v_or_b32_e32 v9, v9, v22
+; VI-NEXT:    v_or_b32_e32 v10, v10, v21
 ; VI-NEXT:    v_or_b32_e32 v11, v11, v20
 ; VI-NEXT:    v_or_b32_e32 v12, v12, v19
 ; VI-NEXT:    v_or_b32_e32 v13, v13, v18
@@ -3488,9 +3488,9 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_cvt_f16_f32_e32 v8, v16
 ; CI-NEXT:    v_or_b32_e32 v7, v9, v7
 ; CI-NEXT:    v_cvt_f16_f32_e32 v9, v15
-; CI-NEXT:    v_cvt_f16_f32_e32 v15, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v22, v28
 ; CI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v29
+; CI-NEXT:    v_cvt_f16_f32_e32 v15, v25
 ; CI-NEXT:    v_or_b32_e32 v8, v9, v8
 ; CI-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
 ; CI-NEXT:    v_cvt_f16_f32_e32 v10, v20
@@ -3501,30 +3501,30 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:8
 ; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v29
 ; CI-NEXT:    v_or_b32_e32 v10, v11, v10
 ; CI-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
 ; CI-NEXT:    v_cvt_f16_f32_e32 v12, v24
 ; CI-NEXT:    v_or_b32_e32 v11, v13, v11
 ; CI-NEXT:    v_cvt_f16_f32_e32 v13, v23
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
-; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:16
-; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:28
-; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:24
-; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; CI-NEXT:    v_cvt_f16_f32_e32 v23, v27
 ; CI-NEXT:    v_cvt_f16_f32_e32 v24, v30
+; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; CI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
 ; CI-NEXT:    v_or_b32_e32 v12, v13, v12
 ; CI-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
+; CI-NEXT:    v_or_b32_e32 v22, v23, v22
+; CI-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
 ; CI-NEXT:    v_or_b32_e32 v13, v15, v13
-; CI-NEXT:    v_cvt_f16_f32_e32 v14, v28
-; CI-NEXT:    v_cvt_f16_f32_e32 v15, v27
-; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:36
-; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:32
-; CI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:44
-; CI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:40
-; CI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; CI-NEXT:    v_or_b32_e32 v14, v15, v14
-; CI-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; CI-NEXT:    v_or_b32_e32 v15, v25, v15
+; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:20
+; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:24
+; CI-NEXT:    v_or_b32_e32 v23, v25, v23
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:36
+; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:44
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:40
 ; CI-NEXT:    s_waitcnt vmcnt(11)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
 ; CI-NEXT:    s_waitcnt vmcnt(10)
@@ -3538,58 +3538,55 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
 ; CI-NEXT:    v_or_b32_e32 v17, v19, v17
 ; CI-NEXT:    s_waitcnt vmcnt(7)
-; CI-NEXT:    v_cvt_f16_f32_e32 v18, v20
+; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; CI-NEXT:    s_waitcnt vmcnt(6)
-; CI-NEXT:    v_cvt_f16_f32_e32 v19, v21
-; CI-NEXT:    s_waitcnt vmcnt(5)
-; CI-NEXT:    v_cvt_f16_f32_e32 v20, v22
-; CI-NEXT:    s_waitcnt vmcnt(4)
-; CI-NEXT:    v_cvt_f16_f32_e32 v21, v23
-; CI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; CI-NEXT:    v_or_b32_e32 v18, v19, v18
-; CI-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; CI-NEXT:    v_or_b32_e32 v19, v21, v19
+; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; CI-NEXT:    s_waitcnt vmcnt(3)
-; CI-NEXT:    v_cvt_f16_f32_e32 v20, v26
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT:    v_cvt_f16_f32_e32 v18, v20
 ; CI-NEXT:    s_waitcnt vmcnt(2)
-; CI-NEXT:    v_cvt_f16_f32_e32 v21, v27
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f16_f32_e32 v26, v28
+; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT:    v_cvt_f16_f32_e32 v19, v21
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v27, v29
-; CI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; CI-NEXT:    v_or_b32_e32 v20, v21, v20
-; CI-NEXT:    v_lshlrev_b32_e32 v21, 16, v26
-; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
-; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:48
-; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:60
-; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
-; CI-NEXT:    v_or_b32_e32 v21, v27, v21
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; CI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; CI-NEXT:    v_or_b32_e32 v14, v15, v14
+; CI-NEXT:    v_lshlrev_b32_e32 v15, 16, v18
+; CI-NEXT:    v_or_b32_e32 v24, v25, v24
+; CI-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
+; CI-NEXT:    v_or_b32_e32 v15, v19, v15
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:48
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60
+; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:56
+; CI-NEXT:    v_or_b32_e32 v25, v27, v25
 ; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:132
 ; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:128
 ; CI-NEXT:    s_waitcnt vmcnt(5)
-; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
 ; CI-NEXT:    s_waitcnt vmcnt(4)
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT:    s_waitcnt vmcnt(3)
-; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; CI-NEXT:    s_waitcnt vmcnt(2)
-; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; CI-NEXT:    v_or_b32_e32 v24, v25, v24
+; CI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; CI-NEXT:    v_or_b32_e32 v18, v19, v18
 ; CI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
 ; CI-NEXT:    v_or_b32_e32 v26, v27, v26
 ; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x7c, v0
 ; CI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
 ; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:124
 ; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:120
-; CI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; CI-NEXT:    v_or_b32_e32 v22, v22, v23
-; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:88
+; CI-NEXT:    v_cvt_f16_f32_e32 v19, v20
+; CI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; CI-NEXT:    v_or_b32_e32 v19, v21, v19
+; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:88
 ; CI-NEXT:    s_waitcnt vmcnt(2)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
 ; CI-NEXT:    s_waitcnt vmcnt(1)
@@ -3601,7 +3598,7 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:116
 ; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:112
 ; CI-NEXT:    s_waitcnt vmcnt(3)
-; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -3613,77 +3610,77 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:108
 ; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:104
 ; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v26
+; CI-NEXT:    v_cvt_f16_f32_e32 v20, v26
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v27
 ; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:92
-; CI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT:    v_or_b32_e32 v25, v26, v25
+; CI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; CI-NEXT:    v_or_b32_e32 v20, v26, v20
 ; CI-NEXT:    v_add_i32_e32 v26, vcc, 0x70, v0
-; CI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
-; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:100
+; CI-NEXT:    buffer_store_dword v20, v26, s[0:3], 0 offen
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:100
 ; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:96
 ; CI-NEXT:    s_waitcnt vmcnt(3)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
 ; CI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; CI-NEXT:    v_or_b32_e32 v23, v23, v27
+; CI-NEXT:    v_or_b32_e32 v21, v21, v27
 ; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v20, v20
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
 ; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x68, v0
-; CI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT:    v_or_b32_e32 v25, v26, v25
+; CI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; CI-NEXT:    v_or_b32_e32 v20, v26, v20
 ; CI-NEXT:    v_add_i32_e32 v26, vcc, 0x6c, v0
-; CI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
-; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:68
+; CI-NEXT:    buffer_store_dword v20, v26, s[0:3], 0 offen
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:68
 ; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:64
-; CI-NEXT:    buffer_store_dword v23, v27, s[0:3], 0 offen
-; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:76
+; CI-NEXT:    buffer_store_dword v21, v27, s[0:3], 0 offen
+; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:76
 ; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:72
 ; CI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:84
 ; CI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:80
 ; CI-NEXT:    s_waitcnt vmcnt(3)
-; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT:    v_cvt_f16_f32_e32 v20, v20
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; CI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT:    v_or_b32_e32 v25, v26, v25
+; CI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; CI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; CI-NEXT:    v_or_b32_e32 v20, v26, v20
 ; CI-NEXT:    s_waitcnt vmcnt(2)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v27
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v29
-; CI-NEXT:    v_or_b32_e32 v23, v26, v23
+; CI-NEXT:    v_or_b32_e32 v21, v26, v21
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v28
 ; CI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
 ; CI-NEXT:    v_or_b32_e32 v26, v27, v26
 ; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x64, v0
 ; CI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
 ; CI-NEXT:    v_add_i32_e32 v26, vcc, 0x60, v0
-; CI-NEXT:    buffer_store_dword v23, v26, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v23, vcc, 0x5c, v0
-; CI-NEXT:    buffer_store_dword v25, v23, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v23, vcc, 0x58, v0
-; CI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v22, vcc, 0x54, v0
-; CI-NEXT:    buffer_store_dword v24, v22, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v22, vcc, 0x50, v0
-; CI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v21, vcc, 0x4c, v0
+; CI-NEXT:    buffer_store_dword v21, v26, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v21, vcc, 0x5c, v0
 ; CI-NEXT:    buffer_store_dword v20, v21, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v20, vcc, 0x48, v0
+; CI-NEXT:    v_add_i32_e32 v20, vcc, 0x58, v0
 ; CI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v19, vcc, 0x44, v0
+; CI-NEXT:    v_add_i32_e32 v19, vcc, 0x54, v0
 ; CI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v18, vcc, 64, v0
-; CI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v17, vcc, 60, v0
-; CI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v16, vcc, 56, v0
-; CI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v15, vcc, 52, v0
+; CI-NEXT:    v_add_i32_e32 v18, vcc, 0x50, v0
+; CI-NEXT:    buffer_store_dword v25, v18, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v18, vcc, 0x4c, v0
+; CI-NEXT:    buffer_store_dword v24, v18, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v18, vcc, 0x48, v0
+; CI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v15, vcc, 0x44, v0
 ; CI-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v14, vcc, 64, v0
+; CI-NEXT:    buffer_store_dword v17, v14, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v14, vcc, 60, v0
+; CI-NEXT:    buffer_store_dword v16, v14, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v14, vcc, 56, v0
+; CI-NEXT:    buffer_store_dword v23, v14, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v14, vcc, 52, v0
+; CI-NEXT:    buffer_store_dword v22, v14, s[0:3], 0 offen
 ; CI-NEXT:    v_add_i32_e32 v14, vcc, 48, v0
 ; CI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
 ; CI-NEXT:    v_add_i32_e32 v13, vcc, 44, v0
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 01dcc26566663be..110c81e84057ec1 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -670,19 +670,19 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 {
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
-; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(5)
+; CI-NEXT:    s_waitcnt vmcnt(8)
 ; CI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(5)
-; CI-NEXT:    buffer_store_dword v16, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(7)
+; CI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -691,19 +691,19 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 {
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    buffer_store_dword v16, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(7)
+; VI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -712,21 +712,21 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    buffer_store_dword v16, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v20, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2634,13 +2634,13 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:8
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2649,9 +2649,9 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; CI-NEXT:    buffer_store_dword v26, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx2 v[24:25], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2666,13 +2666,13 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2681,9 +2681,9 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v26, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx2 v[24:25], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2698,14 +2698,14 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2714,9 +2714,9 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v26, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[24:25], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2773,14 +2773,14 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_ubyte v25, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:16
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_ubyte v16, off, s[0:3], s32 offset:4
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2789,15 +2789,15 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v19, v20
-; CI-NEXT:    v_and_b32_e32 v0, 1, v16
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT:    v_and_b32_e32 v0, 1, v25
 ; CI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v17, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v26, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v27, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v19, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v24, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2812,14 +2812,14 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ubyte v24, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:16
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2828,14 +2828,14 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v0, 1, v20
+; VI-NEXT:    v_and_b32_e32 v0, 1, v24
 ; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v25, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v26, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v27, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2850,16 +2850,15 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_ubyte v24, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2868,14 +2867,14 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v20
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v24
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v25, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_short v26, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_short v27, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2941,14 +2940,14 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
-; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2957,9 +2956,9 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx2 v[24:25], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_dwordx2 v[18:19], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx2 v[26:27], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2974,14 +2973,14 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2990,9 +2989,9 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx2 v[24:25], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dwordx2 v[18:19], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx2 v[26:27], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3007,15 +3006,15 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -3024,9 +3023,9 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[24:25], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dwordx2 v[18:19], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[26:27], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3083,17 +3082,18 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:4
+; CI-NEXT:    s_waitcnt vmcnt(3)
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT:    s_waitcnt vmcnt(2)
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:12
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
-; CI-NEXT:    s_waitcnt vmcnt(2)
-; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT:    v_cvt_f16_f32_e32 v19, v20
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -3102,13 +3102,13 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v26, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v27, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v16, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v25, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v19, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v24, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3123,12 +3123,12 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -3137,9 +3137,9 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v24, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v16, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v25, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3154,14 +3154,13 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -3170,9 +3169,9 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v24, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v16, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v25, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3227,29 +3226,29 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:4
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:24
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
-; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:28
-; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:24
-; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3264,29 +3263,29 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:24
-; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3301,31 +3300,31 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3388,27 +3387,27 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
-; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:28
-; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:24
-; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3425,27 +3424,27 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:24
-; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3462,29 +3461,28 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3545,41 +3543,41 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:64
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:60
+; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:56
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:24
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:16
-; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:12
-; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:8
-; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
+; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
+; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
+; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32
-; CI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28
-; CI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
-; CI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
 ; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:48
-; CI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:44
-; CI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:40
-; CI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:36
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3594,41 +3592,41 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:64
+; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:56
+; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
-; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
+; VI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
+; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
-; VI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:48
-; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:44
-; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:40
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:36
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3643,45 +3641,44 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3756,66 +3753,66 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:64
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:60
+; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:56
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
+; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:44
+; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:40
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:96
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:92
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:88
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:84
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
-; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
-; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
-; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32
-; CI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28
-; CI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
-; CI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
+; CI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:112
+; CI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108
+; CI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:104
+; CI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:100
+; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:128
+; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:124
+; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:120
+; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:116
 ; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:96
-; CI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:92
-; CI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:88
-; CI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112
-; CI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108
-; CI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104
-; CI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:100
-; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:128
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:124
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:120
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:116
-; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:16
-; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:12
-; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:8
-; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
-; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24
+; CI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:80
-; CI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:76
-; CI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:72
-; CI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
-; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:80
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
+; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:72
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:68
+; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_v32i32_v16i32_v16f32:
@@ -3829,66 +3826,66 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:64
+; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:56
+; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
+; VI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:40
+; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
-; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:96
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:92
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:88
+; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:84
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
-; VI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
-; VI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
-; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
-; VI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:112
+; VI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108
+; VI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:104
+; VI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:100
+; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:128
+; VI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:124
+; VI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:120
+; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:116
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:96
-; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:92
-; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:88
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:100
-; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:128
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:124
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:120
-; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:116
-; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:80
-; VI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:76
-; VI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:72
-; VI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
-; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:80
+; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
+; VI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:72
+; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:68
+; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v32i32_v16i32_v16f32:
@@ -3902,74 +3899,72 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:96
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:88
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:84
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:112
+; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:104
+; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:128
+; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:120
+; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:116
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:96
-; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:92
-; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:88
-; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:128
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:120
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:116
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:80
-; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:76
-; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:72
-; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:80
+; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:68
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_v16i32_v16f32:
@@ -4260,65 +4255,65 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:60
+; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:64
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:48
+; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:52
+; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
+; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:36
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:40
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:44
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:64
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:48
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52
-; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:20
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:24
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
-; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
-; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
-; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:4
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:32
-; CI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:20
-; CI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:24
-; CI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:16
 ; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
-; CI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:8
-; CI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v25, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v20, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v24, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v19, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v22, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v18, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v21, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v17, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v20, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v14, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v27, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v13, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v26, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v12, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v23, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v8, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v17, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v15, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v10, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v19, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v9, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v18, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v11, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v12, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v4, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v13, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v5, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v14, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v6, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v15, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4333,65 +4328,65 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ubyte v24, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_load_ubyte v25, off, s[0:3], s32 offset:64
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:48
+; VI-NEXT:    buffer_load_ubyte v21, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ubyte v22, off, s[0:3], s32 offset:56
+; VI-NEXT:    buffer_load_ubyte v23, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_ubyte v26, off, s[0:3], s32 offset:40
+; VI-NEXT:    buffer_load_ubyte v27, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_ubyte v16, off, s[0:3], s32 offset:64
-; VI-NEXT:    buffer_load_ubyte v17, off, s[0:3], s32 offset:48
-; VI-NEXT:    buffer_load_ubyte v18, off, s[0:3], s32 offset:52
-; VI-NEXT:    buffer_load_ubyte v19, off, s[0:3], s32 offset:56
+; VI-NEXT:    buffer_load_ubyte v16, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ubyte v17, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_ubyte v18, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ubyte v19, off, s[0:3], s32 offset:24
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_ubyte v12, off, s[0:3], s32 offset:36
-; VI-NEXT:    buffer_load_ubyte v13, off, s[0:3], s32 offset:40
-; VI-NEXT:    buffer_load_ubyte v14, off, s[0:3], s32 offset:44
-; VI-NEXT:    buffer_load_ubyte v15, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ubyte v12, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_ubyte v13, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ubyte v14, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_ubyte v15, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_ubyte v8, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ubyte v9, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_ubyte v10, off, s[0:3], s32 offset:24
-; VI-NEXT:    buffer_load_ubyte v11, off, s[0:3], s32 offset:16
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_ubyte v4, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_ubyte v5, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_ubyte v6, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v25, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v20, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v24, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v19, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v22, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v18, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v21, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v17, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v20, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v14, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v27, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v13, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v26, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v12, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v23, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v8, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v17, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v15, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v10, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v19, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v9, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v18, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v11, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v12, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v4, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v13, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v5, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v14, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v6, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v15, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4406,69 +4401,69 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_ubyte v24, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_ubyte v25, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_load_ubyte v21, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ubyte v22, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    buffer_load_ubyte v23, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ubyte v26, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_load_ubyte v27, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ubyte v16, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_load_ubyte v17, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_ubyte v18, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    buffer_load_ubyte v19, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_ubyte v16, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_ubyte v17, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_ubyte v18, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_ubyte v19, off, s[0:3], s32 offset:24
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ubyte v12, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_load_ubyte v13, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    buffer_load_ubyte v14, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    buffer_load_ubyte v15, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_ubyte v12, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_ubyte v13, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ubyte v14, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_ubyte v15, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ubyte v8, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ubyte v9, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_ubyte v10, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_ubyte v11, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ubyte v4, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_ubyte v5, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_ubyte v6, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v25, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v20, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v24, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v19, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v22, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v18, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v21, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v17, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v20, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v14, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v27, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v13, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v26, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v12, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v23, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v8, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v17, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v15, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v16, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v10, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v19, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v9, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v18, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v11, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v12, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v4, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v13, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v5, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v14, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v6, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v15, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 9df4b903de4bda0..9a49d363cfd4f1e 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -1496,8 +1496,8 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112
 ; GFX9-NEXT:    buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96
-; GFX9-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[4:7], 0 offset:128
+; GFX9-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80
 ; GFX9-NEXT:    buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64
 ; GFX9-NEXT:    buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48
 ; GFX9-NEXT:    buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32
@@ -1514,13 +1514,13 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:100
 ; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:96
 ; GFX9-NEXT:    s_waitcnt vmcnt(14)
+; GFX9-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:92
 ; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:88
 ; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:84
 ; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:80
 ; GFX9-NEXT:    s_waitcnt vmcnt(17)
-; GFX9-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:128
-; GFX9-NEXT:    s_waitcnt vmcnt(17)
 ; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:76
 ; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:72
 ; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:68
@@ -1789,8 +1789,8 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112
 ; GFX9-NEXT:    buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96
-; GFX9-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[4:7], 0 offset:128
+; GFX9-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80
 ; GFX9-NEXT:    buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64
 ; GFX9-NEXT:    buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48
 ; GFX9-NEXT:    buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32
@@ -1807,13 +1807,13 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:100
 ; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:96
 ; GFX9-NEXT:    s_waitcnt vmcnt(14)
+; GFX9-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:92
 ; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:88
 ; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:84
 ; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:80
 ; GFX9-NEXT:    s_waitcnt vmcnt(17)
-; GFX9-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:128
-; GFX9-NEXT:    s_waitcnt vmcnt(17)
 ; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:76
 ; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:72
 ; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:68
@@ -2082,8 +2082,8 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240
 ; GFX9-NEXT:    buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224
-; GFX9-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[4:7], 0
+; GFX9-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208
 ; GFX9-NEXT:    buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192
 ; GFX9-NEXT:    buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176
 ; GFX9-NEXT:    buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160
@@ -2100,13 +2100,13 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:228
 ; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:224
 ; GFX9-NEXT:    s_waitcnt vmcnt(14)
+; GFX9-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:220
 ; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
 ; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:212
 ; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:208
 ; GFX9-NEXT:    s_waitcnt vmcnt(17)
-; GFX9-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(17)
 ; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:204
 ; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:200
 ; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:196
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 8dd73c5ab32fbc4..3b9e7302a67dd49 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -2399,125 +2399,125 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:160
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:156
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:152
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:148
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:144
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:140
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:136
+; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:132
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:284
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:156
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:280
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:152
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:276
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:148
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:272
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:144
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:268
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:140
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:264
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:136
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:260
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:132
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:256
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:280
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen offset:276
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen offset:272
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen offset:268
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:264
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:260
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen offset:256
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:120
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:112
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:104
+; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:252
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:248
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:244
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:240
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:236
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:232
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:228
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:224
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:248
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen offset:244
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen offset:240
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen offset:236
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:232
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:228
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen offset:224
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:88
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:220
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:216
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:212
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:208
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:204
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:200
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:196
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:192
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:216
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen offset:212
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen offset:208
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen offset:204
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:200
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:196
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen offset:192
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:188
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:184
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:180
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:176
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:172
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:168
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:164
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:160
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:184
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen offset:180
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen offset:176
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen offset:172
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:168
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:164
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen offset:160
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:156
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:152
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:148
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:144
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:140
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:136
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:132
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:152
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen offset:148
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen offset:144
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen offset:140
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:136
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:132
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen offset:128
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:124
@@ -2844,8 +2844,8 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    s_mov_b32 s36, s33
 ; GFX9-NEXT:    s_add_i32 s33, s32, 0x7fc0
 ; GFX9-NEXT:    s_and_b32 s33, s33, 0xffff8000
-; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_add_i32 s32, s32, 0x28000
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
@@ -2853,7 +2853,6 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    s_addc_u32 s35, s35, return_72xi32 at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
@@ -2911,7 +2910,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:156
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:160
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
-; GFX9-NEXT:    v_writelane_b32 v33, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x200, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -2944,27 +2943,27 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    v_mov_b32_e32 v29, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 0
-; GFX9-NEXT:    v_writelane_b32 v33, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:636
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:640
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:644
-; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s33 offset:648
-; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s33 offset:652
-; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s33 offset:656
-; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s33 offset:660
-; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s33 offset:664
-; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s33 offset:668
-; GFX9-NEXT:    buffer_load_dword v48, off, s[0:3], s33 offset:672
-; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s33 offset:676
-; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s33 offset:680
-; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s33 offset:684
-; GFX9-NEXT:    buffer_load_dword v52, off, s[0:3], s33 offset:688
-; GFX9-NEXT:    buffer_load_dword v53, off, s[0:3], s33 offset:692
-; GFX9-NEXT:    buffer_load_dword v54, off, s[0:3], s33 offset:696
-; GFX9-NEXT:    buffer_load_dword v55, off, s[0:3], s33 offset:700
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:704
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:648
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s33 offset:652
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s33 offset:656
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s33 offset:660
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s33 offset:664
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s33 offset:668
+; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s33 offset:672
+; GFX9-NEXT:    buffer_load_dword v48, off, s[0:3], s33 offset:676
+; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s33 offset:680
+; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s33 offset:684
+; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s33 offset:688
+; GFX9-NEXT:    buffer_load_dword v52, off, s[0:3], s33 offset:692
+; GFX9-NEXT:    buffer_load_dword v53, off, s[0:3], s33 offset:696
+; GFX9-NEXT:    buffer_load_dword v54, off, s[0:3], s33 offset:700
+; GFX9-NEXT:    buffer_load_dword v55, off, s[0:3], s33 offset:704
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:708
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:712
 ; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:716
@@ -3036,21 +3035,21 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:68
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:72
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80
@@ -3101,11 +3100,10 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v33, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v33, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_add_i32 s32, s32, 0xfffd8000
 ; GFX9-NEXT:    s_mov_b32 s33, s36
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index ae470efc92feee4..a21195db6c36317 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -1024,6 +1024,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x44
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v4, 1, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v5, 2, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v6, 3, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v7, 4, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v8, 5, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v9, 6, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v10, 7, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v11, 8, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v12, 9, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v13, 10, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v14, 11, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v15, 12, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v16, 13, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 14, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 15, s4
 ; GCN-NEXT:    s_lshr_b32 s1, s4, 24
 ; GCN-NEXT:    s_lshr_b32 s8, s4, 16
 ; GCN-NEXT:    s_lshr_b32 s9, s4, 17
@@ -1032,861 +1048,832 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
 ; GCN-NEXT:    s_lshr_b32 s12, s4, 20
 ; GCN-NEXT:    s_lshr_b32 s13, s4, 21
 ; GCN-NEXT:    s_lshr_b32 s14, s4, 22
-; GCN-NEXT:    s_lshr_b32 s15, s4, 23
-; GCN-NEXT:    s_lshr_b32 s16, s5, 24
-; GCN-NEXT:    s_lshr_b32 s17, s5, 16
-; GCN-NEXT:    s_lshr_b32 s18, s5, 17
-; GCN-NEXT:    s_lshr_b32 s19, s5, 18
-; GCN-NEXT:    s_lshr_b32 s20, s5, 19
-; GCN-NEXT:    s_lshr_b32 s21, s5, 20
-; GCN-NEXT:    s_lshr_b32 s22, s5, 21
-; GCN-NEXT:    s_lshr_b32 s23, s5, 22
-; GCN-NEXT:    s_lshr_b32 s24, s5, 23
-; GCN-NEXT:    s_lshr_b32 s25, s6, 24
-; GCN-NEXT:    s_lshr_b32 s26, s6, 16
-; GCN-NEXT:    s_lshr_b32 s27, s6, 17
-; GCN-NEXT:    s_lshr_b32 s28, s6, 18
-; GCN-NEXT:    s_lshr_b32 s29, s6, 19
-; GCN-NEXT:    s_lshr_b32 s30, s6, 20
-; GCN-NEXT:    s_lshr_b32 s31, s6, 21
-; GCN-NEXT:    s_lshr_b32 s33, s6, 22
-; GCN-NEXT:    s_lshr_b32 s34, s6, 23
-; GCN-NEXT:    s_lshr_b32 s35, s7, 24
-; GCN-NEXT:    s_lshr_b32 s36, s7, 16
-; GCN-NEXT:    s_lshr_b32 s37, s7, 17
-; GCN-NEXT:    s_lshr_b32 s38, s7, 18
-; GCN-NEXT:    s_lshr_b32 s39, s7, 19
-; GCN-NEXT:    s_lshr_b32 s40, s7, 20
-; GCN-NEXT:    s_lshr_b32 s41, s7, 21
-; GCN-NEXT:    s_lshr_b32 s42, s7, 22
-; GCN-NEXT:    s_lshr_b32 s43, s7, 23
+; GCN-NEXT:    s_lshr_b32 s4, s4, 23
+; GCN-NEXT:    s_lshr_b32 s15, s5, 24
+; GCN-NEXT:    s_lshr_b32 s16, s5, 16
+; GCN-NEXT:    s_lshr_b32 s17, s5, 17
+; GCN-NEXT:    s_lshr_b32 s18, s5, 18
+; GCN-NEXT:    s_lshr_b32 s19, s5, 19
+; GCN-NEXT:    s_lshr_b32 s20, s5, 20
+; GCN-NEXT:    s_lshr_b32 s21, s5, 21
+; GCN-NEXT:    s_lshr_b32 s22, s5, 22
+; GCN-NEXT:    s_lshr_b32 s23, s5, 23
+; GCN-NEXT:    s_lshr_b32 s24, s6, 24
+; GCN-NEXT:    s_lshr_b32 s25, s6, 16
+; GCN-NEXT:    s_lshr_b32 s26, s6, 17
+; GCN-NEXT:    s_lshr_b32 s27, s6, 18
+; GCN-NEXT:    s_lshr_b32 s28, s6, 19
+; GCN-NEXT:    s_lshr_b32 s29, s6, 20
+; GCN-NEXT:    s_lshr_b32 s30, s6, 21
+; GCN-NEXT:    s_lshr_b32 s31, s6, 22
+; GCN-NEXT:    s_lshr_b32 s33, s6, 23
+; GCN-NEXT:    s_lshr_b32 s34, s7, 24
+; GCN-NEXT:    s_lshr_b32 s35, s7, 16
+; GCN-NEXT:    s_lshr_b32 s36, s7, 17
+; GCN-NEXT:    s_lshr_b32 s37, s7, 18
+; GCN-NEXT:    s_lshr_b32 s38, s7, 19
+; GCN-NEXT:    s_lshr_b32 s39, s7, 20
+; GCN-NEXT:    s_lshr_b32 s40, s7, 21
+; GCN-NEXT:    s_lshr_b32 s41, s7, 22
+; GCN-NEXT:    s_lshr_b32 s42, s7, 23
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x77
-; GCN-NEXT:    v_mov_b32_e32 v15, s43
+; GCN-NEXT:    v_mov_b32_e32 v22, s42
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x76
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_mov_b32_e32 v18, s42
+; GCN-NEXT:    v_cndmask_b32_e32 v22, 1, v22, vcc
+; GCN-NEXT:    v_mov_b32_e32 v25, s41
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    v_and_b32_e32 v25, 1, v25
+; GCN-NEXT:    v_lshlrev_b16_e32 v22, 3, v22
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 2, v25
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x75
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v18
-; GCN-NEXT:    v_mov_b32_e32 v18, s41
+; GCN-NEXT:    v_or_b32_e32 v22, v22, v25
+; GCN-NEXT:    v_mov_b32_e32 v25, s40
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x74
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_mov_b32_e32 v19, s40
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    v_mov_b32_e32 v26, s39
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 1, v25
+; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
+; GCN-NEXT:    v_or_b32_e32 v25, v26, v25
+; GCN-NEXT:    v_and_b32_e32 v25, 3, v25
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x73
-; GCN-NEXT:    v_or_b32_e32 v15, v18, v15
-; GCN-NEXT:    v_mov_b32_e32 v18, s39
+; GCN-NEXT:    v_or_b32_e32 v22, v25, v22
+; GCN-NEXT:    v_mov_b32_e32 v25, s38
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x72
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_mov_b32_e32 v19, s38
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    v_mov_b32_e32 v26, s37
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 3, v25
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 2, v26
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x71
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_mov_b32_e32 v19, s37
+; GCN-NEXT:    v_or_b32_e32 v25, v25, v26
+; GCN-NEXT:    v_mov_b32_e32 v26, s36
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x70
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_mov_b32_e32 v20, s36
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
-; GCN-NEXT:    v_and_b32_e32 v18, 15, v18
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_mov_b32_e32 v27, s35
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 1, v26
+; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
+; GCN-NEXT:    v_or_b32_e32 v26, v27, v26
+; GCN-NEXT:    v_and_b32_e32 v26, 3, v26
+; GCN-NEXT:    v_or_b32_e32 v25, v26, v25
+; GCN-NEXT:    v_lshlrev_b16_e32 v22, 4, v22
+; GCN-NEXT:    v_and_b32_e32 v25, 15, v25
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7f
-; GCN-NEXT:    v_or_b32_e32 v15, v18, v15
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 7, s35
+; GCN-NEXT:    v_or_b32_e32 v22, v25, v22
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 7, s34
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7e
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 6, s35
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 6, s34
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 3, v25
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 2, v26
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7d
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 5, s35
+; GCN-NEXT:    v_or_b32_e32 v25, v25, v26
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 5, s34
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7c
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 4, s35
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 4, s34
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 1, v26
+; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
+; GCN-NEXT:    v_or_b32_e32 v26, v27, v26
+; GCN-NEXT:    v_and_b32_e32 v26, 3, v26
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7b
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 3, s35
+; GCN-NEXT:    v_or_b32_e32 v25, v26, v25
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 3, s34
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7a
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 2, s35
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 2, s34
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x78
-; GCN-NEXT:    v_mov_b32_e32 v13, s35
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 3, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
+; GCN-NEXT:    v_mov_b32_e32 v20, s34
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 3, v26
+; GCN-NEXT:    v_lshlrev_b16_e32 v27, 2, v27
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x79
-; GCN-NEXT:    v_or_b32_e32 v19, v19, v20
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 1, s35
-; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_or_b32_e32 v26, v26, v27
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 1, s34
 ; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_and_b32_e32 v13, 1, v13
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v13, v13, v20
-; GCN-NEXT:    v_and_b32_e32 v13, 3, v13
-; GCN-NEXT:    v_or_b32_e32 v19, v13, v19
-; GCN-NEXT:    v_mov_b32_e32 v13, 15
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 12, v18
-; GCN-NEXT:    v_and_b32_sdwa v19, v19, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_lshlrev_b16_e32 v27, 1, v27
+; GCN-NEXT:    v_or_b32_e32 v20, v20, v27
+; GCN-NEXT:    v_and_b32_e32 v20, 3, v20
+; GCN-NEXT:    v_or_b32_e32 v26, v20, v26
+; GCN-NEXT:    v_mov_b32_e32 v20, 15
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 12, v25
+; GCN-NEXT:    v_and_b32_sdwa v26, v26, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v25, v25, v26
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6f
-; GCN-NEXT:    v_or_b32_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 15, s7
+; GCN-NEXT:    v_or_b32_sdwa v22, v22, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 15, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6e
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 14, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 14, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 3, v25
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 2, v26
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6d
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 13, s7
+; GCN-NEXT:    v_or_b32_e32 v25, v25, v26
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 13, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6c
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 12, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 12, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 1, v26
+; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
+; GCN-NEXT:    v_or_b32_e32 v26, v27, v26
+; GCN-NEXT:    v_and_b32_e32 v26, 3, v26
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6b
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 11, s7
+; GCN-NEXT:    v_or_b32_e32 v25, v26, v25
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 11, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6a
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 10, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 10, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 3, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 3, v26
+; GCN-NEXT:    v_lshlrev_b16_e32 v27, 2, v27
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x69
-; GCN-NEXT:    v_or_b32_e32 v19, v19, v20
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 9, s7
+; GCN-NEXT:    v_or_b32_e32 v26, v26, v27
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 9, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x68
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 8, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v20
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 12, v18
-; GCN-NEXT:    v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_lshrrev_b16_e64 v24, 8, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v27, 1, v27
+; GCN-NEXT:    v_and_b32_e32 v24, 1, v24
+; GCN-NEXT:    v_or_b32_e32 v24, v24, v27
+; GCN-NEXT:    v_and_b32_e32 v24, 3, v24
+; GCN-NEXT:    v_or_b32_e32 v24, v24, v26
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 12, v25
+; GCN-NEXT:    v_and_b32_sdwa v24, v24, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x67
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 7, s7
+; GCN-NEXT:    v_or_b32_e32 v24, v25, v24
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 7, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x66
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 6, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 6, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 3, v25
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 2, v26
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x65
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 5, s7
+; GCN-NEXT:    v_or_b32_e32 v25, v25, v26
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 5, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x64
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 4, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 4, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 1, v26
+; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
+; GCN-NEXT:    v_or_b32_e32 v26, v27, v26
+; GCN-NEXT:    v_and_b32_e32 v26, 3, v26
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x63
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 3, s7
+; GCN-NEXT:    v_or_b32_e32 v25, v26, v25
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 3, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x62
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 2, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 2, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 3, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 3, v26
+; GCN-NEXT:    v_lshlrev_b16_e32 v27, 2, v27
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x61
-; GCN-NEXT:    v_or_b32_e32 v19, v19, v20
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 1, s7
+; GCN-NEXT:    v_or_b32_e32 v26, v26, v27
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 1, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x60
-; GCN-NEXT:    v_mov_b32_e32 v16, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v20
-; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 4, v18
-; GCN-NEXT:    v_and_b32_e32 v16, 15, v16
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v18
+; GCN-NEXT:    v_mov_b32_e32 v23, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v23, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v27, 1, v27
+; GCN-NEXT:    v_and_b32_e32 v23, 1, v23
+; GCN-NEXT:    v_or_b32_e32 v23, v23, v27
+; GCN-NEXT:    v_and_b32_e32 v23, 3, v23
+; GCN-NEXT:    v_or_b32_e32 v23, v23, v26
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 4, v25
+; GCN-NEXT:    v_and_b32_e32 v23, 15, v23
+; GCN-NEXT:    v_or_b32_e32 v23, v23, v25
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x57
-; GCN-NEXT:    v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v17, s34
+; GCN-NEXT:    v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_mov_b32_e32 v24, s33
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x56
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_mov_b32_e32 v18, s33
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
+; GCN-NEXT:    v_mov_b32_e32 v25, s31
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    v_and_b32_e32 v25, 1, v25
+; GCN-NEXT:    v_lshlrev_b16_e32 v24, 3, v24
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 2, v25
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x55
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
-; GCN-NEXT:    v_mov_b32_e32 v18, s31
+; GCN-NEXT:    v_or_b32_e32 v24, v24, v25
+; GCN-NEXT:    v_mov_b32_e32 v25, s30
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x54
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_mov_b32_e32 v19, s30
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    v_mov_b32_e32 v26, s29
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 1, v25
+; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
+; GCN-NEXT:    v_or_b32_e32 v25, v26, v25
+; GCN-NEXT:    v_and_b32_e32 v25, 3, v25
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x53
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_mov_b32_e32 v18, s29
+; GCN-NEXT:    v_or_b32_e32 v24, v25, v24
+; GCN-NEXT:    v_mov_b32_e32 v25, s28
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x52
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_mov_b32_e32 v19, s28
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    v_mov_b32_e32 v26, s27
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 3, v25
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 2, v26
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x51
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_mov_b32_e32 v19, s27
+; GCN-NEXT:    v_or_b32_e32 v25, v25, v26
+; GCN-NEXT:    v_mov_b32_e32 v26, s26
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x50
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_mov_b32_e32 v20, s26
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 15, v18
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_mov_b32_e32 v27, s25
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 1, v26
+; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
+; GCN-NEXT:    v_or_b32_e32 v26, v27, v26
+; GCN-NEXT:    v_and_b32_e32 v26, 3, v26
+; GCN-NEXT:    v_or_b32_e32 v25, v26, v25
+; GCN-NEXT:    v_lshlrev_b16_e32 v24, 4, v24
+; GCN-NEXT:    v_and_b32_e32 v25, 15, v25
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5f
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 7, s25
+; GCN-NEXT:    v_or_b32_e32 v24, v25, v24
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 7, s24
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5e
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 6, s25
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 6, s24
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 3, v25
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 2, v26
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5d
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 5, s25
+; GCN-NEXT:    v_or_b32_e32 v25, v25, v26
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 5, s24
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5c
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 4, s25
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 4, s24
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 1, v26
+; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
+; GCN-NEXT:    v_or_b32_e32 v26, v27, v26
+; GCN-NEXT:    v_and_b32_e32 v26, 3, v26
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5b
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 3, s25
+; GCN-NEXT:    v_or_b32_e32 v25, v26, v25
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 3, s24
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5a
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 2, s25
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 2, s24
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x58
-; GCN-NEXT:    v_mov_b32_e32 v3, s25
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 3, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
+; GCN-NEXT:    v_mov_b32_e32 v3, s24
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 3, v26
+; GCN-NEXT:    v_lshlrev_b16_e32 v27, 2, v27
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x59
-; GCN-NEXT:    v_or_b32_e32 v19, v19, v20
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 1, s25
+; GCN-NEXT:    v_or_b32_e32 v26, v26, v27
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 1, s24
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
 ; GCN-NEXT:    v_and_b32_e32 v3, 1, v3
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v20
+; GCN-NEXT:    v_lshlrev_b16_e32 v27, 1, v27
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v27
 ; GCN-NEXT:    v_and_b32_e32 v3, 3, v3
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 12, v18
-; GCN-NEXT:    v_and_b32_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v3, v18, v3
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v26
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 12, v25
+; GCN-NEXT:    v_and_b32_sdwa v3, v3, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v3, v25, v3
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4f
-; GCN-NEXT:    v_or_b32_sdwa v17, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_sdwa v24, v24, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NEXT:    v_lshrrev_b16_e64 v3, 15, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4e
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 14, s6
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 14, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    v_and_b32_e32 v25, 1, v25
 ; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 2, v25
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4d
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 13, s6
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v25
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 13, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4c
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 12, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 12, s6
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 1, v25
+; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
+; GCN-NEXT:    v_or_b32_e32 v25, v26, v25
+; GCN-NEXT:    v_and_b32_e32 v25, 3, v25
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4b
-; GCN-NEXT:    v_or_b32_e32 v3, v18, v3
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 11, s6
+; GCN-NEXT:    v_or_b32_e32 v3, v25, v3
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 11, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4a
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 10, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 10, s6
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 3, v25
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 2, v26
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x49
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 9, s6
+; GCN-NEXT:    v_or_b32_e32 v25, v25, v26
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 9, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x48
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 8, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 8, s6
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 1, v26
+; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
+; GCN-NEXT:    v_or_b32_e32 v26, v27, v26
+; GCN-NEXT:    v_and_b32_e32 v26, 3, v26
+; GCN-NEXT:    v_or_b32_e32 v25, v26, v25
 ; GCN-NEXT:    v_lshlrev_b16_e32 v3, 12, v3
-; GCN-NEXT:    v_and_b32_sdwa v18, v18, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_and_b32_sdwa v25, v25, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x47
-; GCN-NEXT:    v_or_b32_e32 v18, v3, v18
+; GCN-NEXT:    v_or_b32_e32 v25, v3, v25
 ; GCN-NEXT:    v_lshrrev_b16_e64 v3, 7, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x46
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 6, s6
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 6, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
 ; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 2, v26
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x45
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 5, s6
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v26
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 5, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x44
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 4, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 4, s6
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v26, 1, v26
+; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
+; GCN-NEXT:    v_or_b32_e32 v26, v27, v26
+; GCN-NEXT:    v_and_b32_e32 v26, 3, v26
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x43
-; GCN-NEXT:    v_or_b32_e32 v19, v19, v3
+; GCN-NEXT:    v_or_b32_e32 v26, v26, v3
 ; GCN-NEXT:    v_lshrrev_b16_e64 v3, 3, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x42
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 2, s6
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 2, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
+; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
 ; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
+; GCN-NEXT:    v_lshlrev_b16_e32 v27, 2, v27
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x41
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v20
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 1, s6
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v27
+; GCN-NEXT:    v_lshrrev_b16_e64 v27, 1, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 64
 ; GCN-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v27, 1, v27, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, 1, v2, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
+; GCN-NEXT:    v_lshlrev_b16_e32 v27, 1, v27
 ; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v20
+; GCN-NEXT:    v_or_b32_e32 v2, v2, v27
 ; GCN-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
-; GCN-NEXT:    v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v19
+; GCN-NEXT:    v_or_b32_sdwa v3, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GCN-NEXT:    v_lshlrev_b16_e32 v22, 4, v26
 ; GCN-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 55
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v15
-; GCN-NEXT:    v_mov_b32_e32 v15, s24
+; GCN-NEXT:    v_or_b32_e32 v2, v2, v22
+; GCN-NEXT:    v_mov_b32_e32 v22, s23
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 54
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_mov_b32_e32 v16, s23
+; GCN-NEXT:    v_cndmask_b32_e32 v22, 1, v22, vcc
+; GCN-NEXT:    v_mov_b32_e32 v23, s22
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v23, vcc
+; GCN-NEXT:    v_and_b32_e32 v23, 1, v23
+; GCN-NEXT:    v_lshlrev_b16_e32 v22, 3, v22
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 2, v23
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 53
-; GCN-NEXT:    v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
-; GCN-NEXT:    v_mov_b32_e32 v16, s22
+; GCN-NEXT:    v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v22, v22, v23
+; GCN-NEXT:    v_mov_b32_e32 v23, s21
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 52
-; GCN-NEXT:    v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_mov_b32_e32 v17, s21
+; GCN-NEXT:    v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v23, vcc
+; GCN-NEXT:    v_mov_b32_e32 v24, s20
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 1, v23
+; GCN-NEXT:    v_and_b32_e32 v24, 1, v24
+; GCN-NEXT:    v_or_b32_e32 v23, v24, v23
+; GCN-NEXT:    v_and_b32_e32 v23, 3, v23
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 51
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_mov_b32_e32 v16, s20
+; GCN-NEXT:    v_or_b32_e32 v22, v23, v22
+; GCN-NEXT:    v_mov_b32_e32 v23, s19
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 50
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_mov_b32_e32 v17, s19
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v23, vcc
+; GCN-NEXT:    v_mov_b32_e32 v24, s18
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
+; GCN-NEXT:    v_and_b32_e32 v24, 1, v24
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 3, v23
+; GCN-NEXT:    v_lshlrev_b16_e32 v24, 2, v24
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 49
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_mov_b32_e32 v17, s18
+; GCN-NEXT:    v_or_b32_e32 v23, v23, v24
+; GCN-NEXT:    v_mov_b32_e32 v24, s17
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 48
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_mov_b32_e32 v18, s17
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
-; GCN-NEXT:    v_and_b32_e32 v16, 15, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
+; GCN-NEXT:    v_mov_b32_e32 v25, s16
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v24, 1, v24
+; GCN-NEXT:    v_and_b32_e32 v25, 1, v25
+; GCN-NEXT:    v_or_b32_e32 v24, v25, v24
+; GCN-NEXT:    v_and_b32_e32 v24, 3, v24
+; GCN-NEXT:    v_or_b32_e32 v23, v24, v23
+; GCN-NEXT:    v_lshlrev_b16_e32 v22, 4, v22
+; GCN-NEXT:    v_and_b32_e32 v23, 15, v23
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 63
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 7, s16
+; GCN-NEXT:    v_or_b32_e32 v22, v23, v22
+; GCN-NEXT:    v_lshrrev_b16_e64 v23, 7, s15
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 62
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 6, s16
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v24, 6, s15
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v23, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
+; GCN-NEXT:    v_and_b32_e32 v24, 1, v24
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 3, v23
+; GCN-NEXT:    v_lshlrev_b16_e32 v24, 2, v24
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 61
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 5, s16
+; GCN-NEXT:    v_or_b32_e32 v23, v23, v24
+; GCN-NEXT:    v_lshrrev_b16_e64 v24, 5, s15
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 60
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 4, s16
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 4, s15
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v24, 1, v24
+; GCN-NEXT:    v_and_b32_e32 v25, 1, v25
+; GCN-NEXT:    v_or_b32_e32 v24, v25, v24
+; GCN-NEXT:    v_and_b32_e32 v24, 3, v24
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 59
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 3, s16
+; GCN-NEXT:    v_or_b32_e32 v23, v24, v23
+; GCN-NEXT:    v_lshrrev_b16_e64 v24, 3, s15
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 58
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 2, s16
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 2, s15
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    v_and_b32_e32 v25, 1, v25
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 56
-; GCN-NEXT:    v_mov_b32_e32 v14, s16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
+; GCN-NEXT:    v_mov_b32_e32 v21, s15
+; GCN-NEXT:    v_lshlrev_b16_e32 v24, 3, v24
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 2, v25
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 57
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 1, s16
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v14, 1, v14
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v18
-; GCN-NEXT:    v_and_b32_e32 v14, 3, v14
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 12, v16
-; GCN-NEXT:    v_and_b32_sdwa v14, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v14, v16, v14
+; GCN-NEXT:    v_or_b32_e32 v24, v24, v25
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 1, s15
+; GCN-NEXT:    v_cndmask_b32_e32 v21, 1, v21, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    v_and_b32_e32 v21, 1, v21
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 1, v25
+; GCN-NEXT:    v_or_b32_e32 v21, v21, v25
+; GCN-NEXT:    v_and_b32_e32 v21, 3, v21
+; GCN-NEXT:    v_or_b32_e32 v21, v21, v24
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 12, v23
+; GCN-NEXT:    v_and_b32_sdwa v21, v21, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v21, v23, v21
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 47
-; GCN-NEXT:    v_or_b32_sdwa v15, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_lshrrev_b16_e64 v14, 15, s5
+; GCN-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_lshrrev_b16_e64 v22, 15, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 46
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 14, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v23, 14, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v22, 1, v22, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v23, vcc
+; GCN-NEXT:    v_and_b32_e32 v23, 1, v23
+; GCN-NEXT:    v_lshlrev_b16_e32 v22, 3, v22
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 2, v23
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 45
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 13, s5
+; GCN-NEXT:    v_or_b32_e32 v22, v22, v23
+; GCN-NEXT:    v_lshrrev_b16_e64 v23, 13, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 44
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 12, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
+; GCN-NEXT:    v_lshrrev_b16_e64 v24, 12, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v23, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 1, v23
+; GCN-NEXT:    v_and_b32_e32 v24, 1, v24
+; GCN-NEXT:    v_or_b32_e32 v23, v24, v23
+; GCN-NEXT:    v_and_b32_e32 v23, 3, v23
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 43
-; GCN-NEXT:    v_or_b32_e32 v14, v16, v14
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 11, s5
+; GCN-NEXT:    v_or_b32_e32 v22, v23, v22
+; GCN-NEXT:    v_lshrrev_b16_e64 v23, 11, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 42
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 10, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v24, 10, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v23, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
+; GCN-NEXT:    v_and_b32_e32 v24, 1, v24
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 3, v23
+; GCN-NEXT:    v_lshlrev_b16_e32 v24, 2, v24
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 41
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 9, s5
+; GCN-NEXT:    v_or_b32_e32 v23, v23, v24
+; GCN-NEXT:    v_lshrrev_b16_e64 v24, 9, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 40
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 8, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
-; GCN-NEXT:    v_and_b32_sdwa v16, v16, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 8, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v24, 1, v24
+; GCN-NEXT:    v_and_b32_e32 v25, 1, v25
+; GCN-NEXT:    v_or_b32_e32 v24, v25, v24
+; GCN-NEXT:    v_and_b32_e32 v24, 3, v24
+; GCN-NEXT:    v_or_b32_e32 v23, v24, v23
+; GCN-NEXT:    v_lshlrev_b16_e32 v22, 12, v22
+; GCN-NEXT:    v_and_b32_sdwa v23, v23, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 39
-; GCN-NEXT:    v_or_b32_e32 v16, v14, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v14, 7, s5
+; GCN-NEXT:    v_or_b32_e32 v22, v22, v23
+; GCN-NEXT:    v_lshrrev_b16_e64 v23, 7, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 38
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 6, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v24, 6, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v23, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
+; GCN-NEXT:    v_and_b32_e32 v24, 1, v24
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 3, v23
+; GCN-NEXT:    v_lshlrev_b16_e32 v24, 2, v24
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 37
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 5, s5
+; GCN-NEXT:    v_or_b32_e32 v23, v23, v24
+; GCN-NEXT:    v_lshrrev_b16_e64 v24, 5, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 36
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 4, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 4, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v24, 1, v24
+; GCN-NEXT:    v_and_b32_e32 v25, 1, v25
+; GCN-NEXT:    v_or_b32_e32 v24, v25, v24
+; GCN-NEXT:    v_and_b32_e32 v24, 3, v24
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 35
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v14
-; GCN-NEXT:    v_lshrrev_b16_e64 v14, 3, s5
+; GCN-NEXT:    v_or_b32_e32 v23, v24, v23
+; GCN-NEXT:    v_lshrrev_b16_e64 v24, 3, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 34
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 2, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 2, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
+; GCN-NEXT:    v_and_b32_e32 v25, 1, v25
+; GCN-NEXT:    v_lshlrev_b16_e32 v24, 3, v24
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 2, v25
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 33
-; GCN-NEXT:    v_or_b32_e32 v18, v14, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v14, 1, s5
+; GCN-NEXT:    v_or_b32_e32 v24, v24, v25
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 1, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 32
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v25, 1, v25, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 1, v14
+; GCN-NEXT:    v_lshlrev_b16_e32 v25, 1, v25
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v14
+; GCN-NEXT:    v_or_b32_e32 v1, v1, v25
 ; GCN-NEXT:    v_and_b32_e32 v1, 3, v1
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
+; GCN-NEXT:    v_or_b32_e32 v1, v1, v24
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 4, v23
 ; GCN-NEXT:    v_and_b32_e32 v1, 15, v1
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v17
-; GCN-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v1, v1, v23
+; GCN-NEXT:    v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 23
-; GCN-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v15, s15
+; GCN-NEXT:    v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GCN-NEXT:    v_mov_b32_e32 v21, s4
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 22
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_mov_b32_e32 v16, s14
+; GCN-NEXT:    v_cndmask_b32_e32 v21, 1, v21, vcc
+; GCN-NEXT:    v_mov_b32_e32 v22, s14
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v22, 1, v22, vcc
+; GCN-NEXT:    v_and_b32_e32 v22, 1, v22
+; GCN-NEXT:    v_lshlrev_b16_e32 v21, 3, v21
+; GCN-NEXT:    v_lshlrev_b16_e32 v22, 2, v22
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 21
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
-; GCN-NEXT:    v_mov_b32_e32 v16, s13
+; GCN-NEXT:    v_or_b32_e32 v21, v21, v22
+; GCN-NEXT:    v_mov_b32_e32 v22, s13
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 20
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_mov_b32_e32 v17, s12
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v22, 1, v22, vcc
+; GCN-NEXT:    v_mov_b32_e32 v23, s12
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v23, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v22, 1, v22
+; GCN-NEXT:    v_and_b32_e32 v23, 1, v23
+; GCN-NEXT:    v_or_b32_e32 v22, v23, v22
+; GCN-NEXT:    v_and_b32_e32 v22, 3, v22
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 19
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_mov_b32_e32 v16, s11
+; GCN-NEXT:    v_or_b32_e32 v21, v22, v21
+; GCN-NEXT:    v_mov_b32_e32 v22, s11
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 18
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_mov_b32_e32 v17, s10
+; GCN-NEXT:    v_cndmask_b32_e32 v22, 1, v22, vcc
+; GCN-NEXT:    v_mov_b32_e32 v23, s10
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v23, vcc
+; GCN-NEXT:    v_and_b32_e32 v23, 1, v23
+; GCN-NEXT:    v_lshlrev_b16_e32 v22, 3, v22
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 2, v23
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 17
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_mov_b32_e32 v17, s9
+; GCN-NEXT:    v_or_b32_e32 v22, v22, v23
+; GCN-NEXT:    v_mov_b32_e32 v23, s9
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 16
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_mov_b32_e32 v19, s8
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v17, v19, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
-; GCN-NEXT:    v_and_b32_e32 v16, 15, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v23, vcc
+; GCN-NEXT:    v_mov_b32_e32 v26, s8
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 1, v23
+; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
+; GCN-NEXT:    v_or_b32_e32 v23, v26, v23
+; GCN-NEXT:    v_and_b32_e32 v23, 3, v23
+; GCN-NEXT:    v_or_b32_e32 v22, v23, v22
+; GCN-NEXT:    v_lshlrev_b16_e32 v21, 4, v21
+; GCN-NEXT:    v_and_b32_e32 v22, 15, v22
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 31
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 7, s1
+; GCN-NEXT:    v_or_b32_e32 v21, v22, v21
+; GCN-NEXT:    v_lshrrev_b16_e64 v22, 7, s1
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 30
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 6, s1
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v23, 6, s1
+; GCN-NEXT:    v_cndmask_b32_e32 v22, 1, v22, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v23, vcc
+; GCN-NEXT:    v_and_b32_e32 v23, 1, v23
+; GCN-NEXT:    v_lshlrev_b16_e32 v22, 3, v22
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 2, v23
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 29
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 5, s1
+; GCN-NEXT:    v_or_b32_e32 v22, v22, v23
+; GCN-NEXT:    v_lshrrev_b16_e64 v23, 5, s1
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 28
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s1
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 4, s1
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v23, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v17, v19, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
+; GCN-NEXT:    v_cndmask_b32_e32 v26, 1, v26, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 1, v23
+; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
+; GCN-NEXT:    v_or_b32_e32 v23, v26, v23
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 27
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 3, s1
+; GCN-NEXT:    v_lshrrev_b16_e64 v26, 3, s1
+; GCN-NEXT:    v_and_b32_e32 v23, 3, v23
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 26
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s1
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v24, 2, s1
+; GCN-NEXT:    v_or_b32_e32 v22, v23, v22
+; GCN-NEXT:    v_cndmask_b32_e32 v23, 1, v26, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v24, vcc
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 24
-; GCN-NEXT:    v_mov_b32_e32 v18, s1
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
+; GCN-NEXT:    v_mov_b32_e32 v19, s1
+; GCN-NEXT:    v_and_b32_e32 v24, 1, v24
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 25
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s1
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_lshrrev_b16_e64 v25, 1, s1
+; GCN-NEXT:    v_lshlrev_b16_e32 v23, 3, v23
+; GCN-NEXT:    v_lshlrev_b16_e32 v24, 2, v24
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 12, v16
-; GCN-NEXT:    v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 15
-; GCN-NEXT:    v_or_b32_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 15, s4
+; GCN-NEXT:    v_or_b32_e32 v23, v23, v24
+; GCN-NEXT:    v_cndmask_b32_e32 v24, 1, v25, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 14
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 14, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 13
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 13, s4
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 12
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 12, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 11
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 11, s4
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
+; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 10
-; GCN-NEXT:    v_lshrrev_b16_e64 v14, 10, s4
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v19, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 9
-; GCN-NEXT:    v_lshrrev_b16_e64 v12, 9, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 8
-; GCN-NEXT:    v_lshrrev_b16_e64 v11, 8, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v12, 1, v12, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 7
-; GCN-NEXT:    v_lshrrev_b16_e64 v10, 7, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v11, 1, v11, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 6
-; GCN-NEXT:    v_lshrrev_b16_e64 v9, 6, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v10, 1, v10, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 5
-; GCN-NEXT:    v_lshrrev_b16_e64 v8, 5, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v9, 1, v9, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 4
-; GCN-NEXT:    v_lshrrev_b16_e64 v7, 4, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v8, 1, v8, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 3
-; GCN-NEXT:    v_lshrrev_b16_e64 v6, 3, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v7, 1, v7, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 2
-; GCN-NEXT:    v_lshrrev_b16_e64 v5, 2, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v6, 1, v6, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 1
-; GCN-NEXT:    v_lshrrev_b16_e64 v4, 1, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, 1, v5, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    v_and_b32_e32 v14, 1, v14
+; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v24, 1, v24
+; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
+; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
+; GCN-NEXT:    v_and_b32_e32 v13, 1, v13
 ; GCN-NEXT:    v_lshlrev_b16_e32 v12, 1, v12
 ; GCN-NEXT:    v_and_b32_e32 v11, 1, v11
 ; GCN-NEXT:    v_and_b32_e32 v9, 1, v9
@@ -1895,8 +1882,12 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
 ; GCN-NEXT:    v_and_b32_e32 v5, 1, v5
 ; GCN-NEXT:    v_lshlrev_b16_e32 v4, 1, v4
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
+; GCN-NEXT:    v_or_b32_e32 v19, v19, v24
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
+; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
+; GCN-NEXT:    v_lshlrev_b16_e32 v13, 2, v13
 ; GCN-NEXT:    v_or_b32_e32 v11, v11, v12
 ; GCN-NEXT:    v_lshlrev_b16_e32 v10, 3, v10
 ; GCN-NEXT:    v_lshlrev_b16_e32 v9, 2, v9
@@ -1904,24 +1895,33 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
 ; GCN-NEXT:    v_lshlrev_b16_e32 v6, 3, v6
 ; GCN-NEXT:    v_lshlrev_b16_e32 v5, 2, v5
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v4
-; GCN-NEXT:    v_or_b32_e32 v14, v17, v14
+; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
+; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
+; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
+; GCN-NEXT:    v_or_b32_e32 v13, v14, v13
 ; GCN-NEXT:    v_and_b32_e32 v11, 3, v11
 ; GCN-NEXT:    v_or_b32_e32 v9, v10, v9
 ; GCN-NEXT:    v_and_b32_e32 v7, 3, v7
 ; GCN-NEXT:    v_or_b32_e32 v5, v6, v5
 ; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
-; GCN-NEXT:    v_or_b32_e32 v11, v11, v14
+; GCN-NEXT:    v_or_b32_e32 v19, v19, v23
+; GCN-NEXT:    v_or_b32_e32 v15, v15, v17
+; GCN-NEXT:    v_or_b32_e32 v11, v11, v13
 ; GCN-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v5
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 12, v16
-; GCN-NEXT:    v_and_b32_sdwa v11, v11, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_lshlrev_b16_e32 v22, 12, v22
+; GCN-NEXT:    v_and_b32_sdwa v19, v19, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
+; GCN-NEXT:    v_and_b32_sdwa v11, v11, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    v_lshlrev_b16_e32 v7, 4, v7
 ; GCN-NEXT:    v_and_b32_e32 v0, 15, v0
-; GCN-NEXT:    v_or_b32_e32 v11, v16, v11
+; GCN-NEXT:    v_or_b32_e32 v19, v22, v19
+; GCN-NEXT:    v_or_b32_e32 v11, v15, v11
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v7
+; GCN-NEXT:    v_or_b32_sdwa v19, v21, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-NEXT:    v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GCN-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 87913841012184c..ee9aeded6fcd0bb 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -3112,27 +3112,26 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s42, s4, 31
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s43, s7, 31
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s44, s6, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s45, s17, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s46, s16, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s47, s19, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s48, s18, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s49, s21, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s50, s20, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s51, s23, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s52, s30, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s53, s31, 31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s52
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s52, s28, 31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s53
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s53, s29, 31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s52
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s52, s26, 31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s53
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s53, s27, 31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s52
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s52, s22, 31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s53
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s53, s25, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s45, s9, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s46, s8, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s47, s11, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s48, s17, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s49, s16, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s50, s19, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s51, s18, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s52, s21, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s53, s20, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s54, s23, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s55, s22, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s56, s25, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s57, s24, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s58, s27, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s59, s30, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s60, s31, 31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s59
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s59, s26, 31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s60
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s60, s29, 31
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s30
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s31
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s28
@@ -3143,90 +3142,89 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s25
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s22
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s23
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v20, s20
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v22, s21
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v24, s18
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v26, s19
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:240
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s20
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s21
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s16
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s17
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s16, s28, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s17, s10, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s18, s13, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s19, s12, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s20, s15, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s21, s14, 31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s16
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s60
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:224
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s18
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s19
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s14
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s15
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s59
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s58
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:208
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s16
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s17
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s16, s24, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s17, s9, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s18, s8, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s19, s11, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s20, s10, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s21, s13, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s22, s12, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s23, s15, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s24, s14, 31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s16
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s53
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s12
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s13
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s57
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s56
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:192
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s14
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s15
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s52
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s51
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s10
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s11
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s55
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s54
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:176
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s12
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s13
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s50
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s49
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:160
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s11
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s48
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s47
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:144
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s8
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s9
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v21, s53
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v23, s52
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:160
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s8
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s9
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s46
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s45
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:128
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v20, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v22, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v25, s51
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v27, s50
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[36:39], 0 offset:144
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s6
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s7
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s24
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s23
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:112
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s5
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s22
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s21
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:96
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v24, s4
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v26, s5
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s49
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s48
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:128
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s2
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s3
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s20
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s19
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s21
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s20
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:112
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s18
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s17
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:64
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s44
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s43
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:48
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s42
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s41
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:32
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s40
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s35
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:16
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s34
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s33
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s1
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s19
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s18
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:96
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s17
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s47
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:80
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s46
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s45
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:64
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v21, s44
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v23, s43
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:48
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v25, s42
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v27, s41
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[36:39], 0 offset:32
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s40
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s35
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:16
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s34
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s33
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[36:39], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64:
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index 4337782cb32e952..1f69b905ab95012 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -84,13 +84,11 @@ body:             |
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     S_NOP 0
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
@@ -191,14 +189,12 @@ body:             |
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
     S_NOP 0, implicit %24
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %23
     S_NOP 0, implicit %0, implicit %1
@@ -300,7 +296,6 @@ body:             |
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -308,7 +303,6 @@ body:             |
     S_NOP 0, implicit %23
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
@@ -408,7 +402,6 @@ body:             |
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -416,7 +409,6 @@ body:             |
     S_NOP 0, implicit %22, implicit %23
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
@@ -529,7 +521,6 @@ body:             |
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -537,14 +528,12 @@ body:             |
     S_NOP 0, implicit %23
 
   bb.2:
-  ; predcessors: %bb.1
     successors: %bb.3
 
     %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
     S_NOP 0
 
   bb.3:
-  ; predecessors: %bb.2
     successors: %bb.4
 
     %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
@@ -552,7 +541,6 @@ body:             |
     S_NOP 0, implicit %25
 
   bb.4:
-  ; predcessors: %bb.3
 
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
@@ -666,7 +654,6 @@ body:             |
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -674,7 +661,6 @@ body:             |
     S_NOP 0, implicit %23, implicit %22
 
   bb.2:
-  ; predcessors: %bb.1
     successors: %bb.3
 
     %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
@@ -682,7 +668,6 @@ body:             |
     S_NOP 0
 
   bb.3:
-  ; predecessors: %bb.2
     successors: %bb.4
 
     %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
@@ -690,7 +675,6 @@ body:             |
     S_NOP 0, implicit %25, implicit %26
 
   bb.4:
-  ; predcessors: %bb.3
 
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
@@ -949,14 +933,12 @@ body:             |
     undef %23.sub0:vreg_64 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %23.sub1:vreg_64 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
     S_NOP 0, implicit %23
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
@@ -1053,7 +1035,6 @@ body:             |
     undef %21.sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
@@ -1062,7 +1043,6 @@ body:             |
     S_NOP 0, implicit %21
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
@@ -1581,7 +1561,6 @@ body:             |
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -1589,7 +1568,6 @@ body:             |
     S_NOP 0, implicit %24, implicit %25
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %23
     S_NOP 0, implicit %0, implicit %1
@@ -2528,14 +2506,12 @@ body:             |
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
     S_NOP 0, implicit %24
 
   bb.2:
-  ; predcessors: %bb.1
     successors: %bb.3
 
     S_NOP 0, implicit %23
@@ -2543,7 +2519,6 @@ body:             |
     S_NOP 0
 
   bb.3:
-  ; predecessors: %bb.2
     successors: %bb.4
 
     %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
@@ -2551,7 +2526,6 @@ body:             |
     S_NOP 0, implicit %26, implicit %27
 
   bb.4:
-  ; predcessors: %bb.3
 
     S_NOP 0, implicit %25
     S_NOP 0, implicit %0, implicit %1
@@ -2650,7 +2624,6 @@ body:             |
     %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %21.sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
@@ -2658,7 +2631,6 @@ body:             |
     S_NOP 0, implicit %21
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
@@ -2759,7 +2731,6 @@ body:             |
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -2767,7 +2738,6 @@ body:             |
     S_NOP 0, implicit %23
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %23
     S_NOP 0, implicit %0, implicit %1
@@ -5030,7 +5000,6 @@ body:             |
     %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %21.sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
@@ -5038,7 +5007,6 @@ body:             |
     S_NOP 0, implicit %21
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
@@ -5137,14 +5105,12 @@ body:             |
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
     S_NOP 0, implicit %23, implicit %24
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
@@ -5242,7 +5208,6 @@ body:             |
     %22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
@@ -5250,7 +5215,6 @@ body:             |
     S_NOP 0, implicit %22
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
@@ -5348,7 +5312,6 @@ body:             |
     %22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
@@ -5357,7 +5320,6 @@ body:             |
     S_NOP 0, implicit %22
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
@@ -5456,7 +5418,6 @@ body:             |
     %22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
@@ -5466,7 +5427,6 @@ body:             |
     S_NOP 0, implicit %22
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
@@ -5562,14 +5522,12 @@ body:             |
     %22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
     S_NOP 0, implicit %22, implicit %23
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
@@ -5669,14 +5627,12 @@ body:             |
     undef %23.sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
     S_NOP 0, implicit %24
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %23.sub1
     S_NOP 0, implicit %0, implicit %1
@@ -5779,14 +5735,12 @@ body:             |
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
     S_NOP 0, implicit %24
 
   bb.2:
-  ; predcessors: %bb.1
 
     DBG_VALUE %23, 0, 0
     S_NOP 0, implicit %23
@@ -5889,14 +5843,12 @@ body:             |
     %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
     %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
     S_NOP 0, implicit %24
 
   bb.2:
-  ; predcessors: %bb.1
 
     S_NOP 0, implicit %23
     S_NOP 0, implicit %0, implicit %1



More information about the llvm-commits mailing list