[llvm] 555d8f4 - [AMDGPU] Bundle loads before post-RA scheduler

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 24 11:33:56 PST 2020


Author: Stanislav Mekhanoshin
Date: 2020-01-24T11:33:38-08:00
New Revision: 555d8f4ef5ebb2cdce2636af5102ff944da5fef8

URL: https://github.com/llvm/llvm-project/commit/555d8f4ef5ebb2cdce2636af5102ff944da5fef8
DIFF: https://github.com/llvm/llvm-project/commit/555d8f4ef5ebb2cdce2636af5102ff944da5fef8.diff

LOG: [AMDGPU] Bundle loads before post-RA scheduler

We are relying on artificial DAG edges inserted by the
MemOpClusterMutation to keep loads and stores together in the
post-RA scheduler. This does not always work, since it still
allows a completely independent instruction to be scheduled in
the middle of the cluster.

Removed the DAG mutation and added a pass that bundles already
clustered instructions. These bundles are unpacked before the
memory legalizer, both because the legalizer does not work with
bundles and because unpacking there allows waitcounts to be
inserted in the middle of a store cluster.

Removing the artificial edges also allows more relaxed scheduling.

Differential Revision: https://reviews.llvm.org/D72737
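
For readers skimming the patch, here is a condensed sketch of what the new
pass does: it walks each basic block and wraps every maximal run of adjacent,
same-kind memory instructions into a BUNDLE via finalizeBundle, so the post-RA
scheduler treats the clause as a single unit. The helper name below is
invented for illustration, and the simplified predicate omits the SIInstrFlags
memory-class check and the dependent-load check that the real SIPostRABundler
(added by this commit, see the new file below) performs.

// A condensed, illustrative sketch; not the committed code.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBundle.h"

using namespace llvm;

static bool bundleMemClauses(MachineBasicBlock &MBB) {
  bool Changed = false;
  MachineBasicBlock::instr_iterator Next;
  MachineBasicBlock::instr_iterator B = MBB.instr_begin();
  MachineBasicBlock::instr_iterator E = MBB.instr_end();

  for (auto I = B; I != E; I = Next) {
    Next = std::next(I);

    // Extend the current run while I is an unbundled memory instruction of
    // the same kind as the run head B (loads with loads, stores with stores).
    // The real pass additionally requires the same SIInstrFlags memory class
    // and rejects loads that read a register defined earlier in the run.
    if (!I->isBundled() && I->mayLoadOrStore() &&
        B->mayLoad() == I->mayLoad() && B->mayStore() == I->mayStore())
      continue;

    // The run ends before I: wrap it in a BUNDLE if it contains at least two
    // instructions, then restart the scan so I can head the next run.
    if (B != I) {
      if (std::next(B) != I) {
        finalizeBundle(MBB, B, I); // Inserts the BUNDLE header instruction.
        Changed = true;
      }
      Next = I;
    }
    B = Next;
  }

  // Close a run that reaches the end of the block.
  if (B != E && std::next(B) != E) {
    finalizeBundle(MBB, B, E);
    Changed = true;
  }
  return Changed;
}

The bundles produced this way survive until SIMemoryLegalizer, which (per the
hunk below) strips the BUNDLE headers again so that cache-policy and waitcount
handling can operate on the individual loads and stores.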

Added: 
    llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
    llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
    llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
    llvm/test/CodeGen/AMDGPU/call-argument-types.ll
    llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
    llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
    llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
    llvm/test/CodeGen/AMDGPU/global-saddr.ll
    llvm/test/CodeGen/AMDGPU/half.ll
    llvm/test/CodeGen/AMDGPU/idot2.ll
    llvm/test/CodeGen/AMDGPU/idot4s.ll
    llvm/test/CodeGen/AMDGPU/idot4u.ll
    llvm/test/CodeGen/AMDGPU/idot8s.ll
    llvm/test/CodeGen/AMDGPU/idot8u.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
    llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
    llvm/test/CodeGen/AMDGPU/load-lo16.ll
    llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
    llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
    llvm/test/CodeGen/AMDGPU/max.i16.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
    llvm/test/CodeGen/AMDGPU/memory_clause.ll
    llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
    llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
    llvm/test/CodeGen/AMDGPU/saddo.ll
    llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
    llvm/test/CodeGen/AMDGPU/scratch-simple.ll
    llvm/test/CodeGen/AMDGPU/select.f16.ll
    llvm/test/CodeGen/AMDGPU/shl.ll
    llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
    llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
    llvm/test/CodeGen/AMDGPU/sign_extend.ll
    llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
    llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
    llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
    llvm/test/CodeGen/AMDGPU/v_madak_f16.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 621a93d45879..1859cfaba5bc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -59,6 +59,7 @@ FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createSIPreAllocateWWMRegsPass();
 FunctionPass *createSIFormMemoryClausesPass();
+FunctionPass *createSIPostRABundlerPass();
 FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &,
                                                const TargetMachine *);
 FunctionPass *createAMDGPUUseNativeCallsPass();
@@ -229,6 +230,9 @@ extern char &SIInsertWaitcntsID;
 void initializeSIFormMemoryClausesPass(PassRegistry&);
 extern char &SIFormMemoryClausesID;
 
+void initializeSIPostRABundlerPass(PassRegistry&);
+extern char &SIPostRABundlerID;
+
 void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
 extern char &AMDGPUUnifyDivergentExitNodesID;
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 6d10ebd24782..75c4fba3e8fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -754,53 +754,6 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
 }
 
 namespace {
-struct MemOpClusterMutation : ScheduleDAGMutation {
-  const SIInstrInfo *TII;
-
-  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
-
-  void apply(ScheduleDAGInstrs *DAG) override {
-    SUnit *SUa = nullptr;
-    // Search for two consequent memory operations and link them
-    // to prevent scheduler from moving them apart.
-    // In DAG pre-process SUnits are in the original order of
-    // the instructions before scheduling.
-    for (SUnit &SU : DAG->SUnits) {
-      MachineInstr &MI2 = *SU.getInstr();
-      if (!MI2.mayLoad() && !MI2.mayStore()) {
-        SUa = nullptr;
-        continue;
-      }
-      if (!SUa) {
-        SUa = &SU;
-        continue;
-      }
-
-      MachineInstr &MI1 = *SUa->getInstr();
-      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
-          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
-          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
-          (TII->isDS(MI1)   && TII->isDS(MI2))) {
-        SU.addPredBarrier(SUa);
-
-        for (const SDep &SI : SU.Preds) {
-          if (SI.getSUnit() != SUa)
-            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
-        }
-
-        if (&SU != &DAG->ExitSU) {
-          for (const SDep &SI : SUa->Succs) {
-            if (SI.getSUnit() != &SU)
-              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
-          }
-        }
-      }
-
-      SUa = &SU;
-    }
-  }
-};
-
 struct FillMFMAShadowMutation : ScheduleDAGMutation {
   const SIInstrInfo *TII;
 
@@ -927,7 +880,6 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
 
 void GCNSubtarget::getPostRAMutations(
     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
-  Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0b0d16967324..6d1b72a70392 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -235,6 +235,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIOptimizeExecMaskingPass(*PR);
   initializeSIPreAllocateWWMRegsPass(*PR);
   initializeSIFormMemoryClausesPass(*PR);
+  initializeSIPostRABundlerPass(*PR);
   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
   initializeAMDGPUAAWrapperPassPass(*PR);
   initializeAMDGPUExternalAAWrapperPass(*PR);
@@ -980,6 +981,7 @@ void GCNPassConfig::addPostRegAlloc() {
 }
 
 void GCNPassConfig::addPreSched2() {
+  addPass(&SIPostRABundlerID);
 }
 
 void GCNPassConfig::addPreEmitPass() {

diff  --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 3bbf2c346954..b52fcdc53638 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -118,6 +118,7 @@ add_llvm_target(AMDGPUCodeGen
   SIOptimizeExecMasking.cpp
   SIOptimizeExecMaskingPreRA.cpp
   SIPeepholeSDWA.cpp
+  SIPostRABundler.cpp
   SIRegisterInfo.cpp
   SIRemoveShortExecBranches.cpp
   SIShrinkInstructions.cpp

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c3156f948a3c..2ac5aac6d5ce 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1663,22 +1663,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
     break;
   }
-  case TargetOpcode::BUNDLE: {
-    if (!MI.mayLoad() || MI.hasUnmodeledSideEffects())
-      return false;
-
-    // If it is a load it must be a memory clause
-    for (MachineBasicBlock::instr_iterator I = MI.getIterator();
-         I->isBundledWithSucc(); ++I) {
-      I->unbundleFromSucc();
-      for (MachineOperand &MO : I->operands())
-        if (MO.isReg())
-          MO.setIsInternalRead(false);
-    }
-
-    MI.eraseFromParent();
-    break;
-  }
   }
   return true;
 }

diff  --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index e914573306ae..242211d8423b 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1289,6 +1289,21 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
 
   for (auto &MBB : MF) {
     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
+
+      if (MI->getOpcode() == TargetOpcode::BUNDLE && MI->mayLoadOrStore()) {
+        MachineBasicBlock::instr_iterator II(MI->getIterator());
+        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
+             I != E && I->isBundledWithPred(); ++I) {
+          I->unbundleFromPred();
+          for (MachineOperand &MO : I->operands())
+            if (MO.isReg())
+              MO.setIsInternalRead(false);
+        }
+
+        MI->eraseFromParent();
+        MI = II->getIterator();
+      }
+
       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
         continue;
 

diff  --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
new file mode 100644
index 000000000000..acb0fbfe812d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -0,0 +1,138 @@
+//===-- SIPostRABundler.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass creates bundles of memory instructions to protect adjacent loads
+/// and stores from being rescheduled apart from each other post-RA.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-post-ra-bundler"
+
+namespace {
+
+class SIPostRABundler : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIPostRABundler() : MachineFunctionPass(ID) {
+    initializeSIPostRABundlerPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI post-RA bundler";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  const SIRegisterInfo *TRI;
+
+  SmallSet<Register, 16> Defs;
+
+  bool isDependentLoad(const MachineInstr &MI) const;
+
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIPostRABundler, DEBUG_TYPE, "SI post-RA bundler", false, false)
+
+char SIPostRABundler::ID = 0;
+
+char &llvm::SIPostRABundlerID = SIPostRABundler::ID;
+
+FunctionPass *llvm::createSIPostRABundlerPass() {
+  return new SIPostRABundler();
+}
+
+bool SIPostRABundler::isDependentLoad(const MachineInstr &MI) const {
+  if (!MI.mayLoad())
+    return false;
+
+  for (const MachineOperand &Op : MI.explicit_operands()) {
+    if (!Op.isReg())
+      continue;
+    Register Reg = Op.getReg();
+    for (const Register Def : Defs)
+      if (TRI->regsOverlap(Reg, Def))
+        return true;
+  }
+
+  return false;
+}
+
+bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+  bool Changed = false;
+  const unsigned MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
+                            SIInstrFlags::SMRD | SIInstrFlags::DS |
+                            SIInstrFlags::FLAT | SIInstrFlags::MIMG;
+
+  for (MachineBasicBlock &MBB : MF) {
+    MachineBasicBlock::instr_iterator Next;
+    MachineBasicBlock::instr_iterator B = MBB.instr_begin();
+    MachineBasicBlock::instr_iterator E = MBB.instr_end();
+    for (auto I = B; I != E; I = Next) {
+      Next = std::next(I);
+
+      if (I->isBundled() || !I->mayLoadOrStore() ||
+          B->mayLoad() != I->mayLoad() || B->mayStore() != I->mayStore() ||
+          (B->getDesc().TSFlags & MemFlags) !=
+          (I->getDesc().TSFlags & MemFlags) ||
+          isDependentLoad(*I)) {
+
+        if (B != I) {
+          if (std::next(B) != I) {
+            finalizeBundle(MBB, B, I);
+            Changed = true;
+          }
+          Next = I;
+        }
+
+        B = Next;
+        Defs.clear();
+        continue;
+      }
+
+      if (I->getNumExplicitDefs() == 0)
+        continue;
+
+      Defs.insert(I->defs().begin()->getReg());
+    }
+
+    if (B != E && std::next(B) != E) {
+      finalizeBundle(MBB, B, E);
+      Changed = true;
+    }
+
+    Defs.clear();
+  }
+
+  return Changed;
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 7b6d247d0023..4f6a20d4af6a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -807,9 +807,9 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; MOVREL-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; MOVREL-NEXT:    buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill
 ; MOVREL-NEXT:    v_mov_b32_e32 v34, s19
-; MOVREL-NEXT:    v_mov_b32_e32 v33, s18
 ; MOVREL-NEXT:    v_mov_b32_e32 v32, s17
 ; MOVREL-NEXT:    v_mov_b32_e32 v31, s16
+; MOVREL-NEXT:    v_mov_b32_e32 v33, s18
 ; MOVREL-NEXT:    v_mov_b32_e32 v30, s15
 ; MOVREL-NEXT:    v_mov_b32_e32 v29, s14
 ; MOVREL-NEXT:    v_mov_b32_e32 v28, s13
@@ -1113,10 +1113,10 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
 ; MOVREL-NEXT:    v_movreld_b32_e32 v2, v0
 ; MOVREL-NEXT:    v_movreld_b32_e32 v3, v1
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
 ; MOVREL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; MOVREL-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
 ; MOVREL-NEXT:    global_store_dwordx4 v[0:1], v[10:13], off
-; MOVREL-NEXT:    ; implicit-def: $vcc_hi
 ; MOVREL-NEXT:    global_store_dwordx4 v[0:1], v[14:17], off
 ; MOVREL-NEXT:    s_endpgm
 entry:
@@ -2053,24 +2053,24 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v
 ; MOVREL-NEXT:    s_movreld_b64 s[2:3], s[18:19]
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
 ; MOVREL-NEXT:    v_mov_b32_e32 v4, s4
-; MOVREL-NEXT:    v_mov_b32_e32 v8, s8
-; MOVREL-NEXT:    v_mov_b32_e32 v12, s12
 ; MOVREL-NEXT:    v_mov_b32_e32 v1, s1
 ; MOVREL-NEXT:    v_mov_b32_e32 v2, s2
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s3
+; MOVREL-NEXT:    v_mov_b32_e32 v8, s8
 ; MOVREL-NEXT:    v_mov_b32_e32 v5, s5
 ; MOVREL-NEXT:    v_mov_b32_e32 v6, s6
 ; MOVREL-NEXT:    v_mov_b32_e32 v7, s7
+; MOVREL-NEXT:    v_mov_b32_e32 v12, s12
 ; MOVREL-NEXT:    v_mov_b32_e32 v9, s9
 ; MOVREL-NEXT:    v_mov_b32_e32 v10, s10
 ; MOVREL-NEXT:    v_mov_b32_e32 v11, s11
 ; MOVREL-NEXT:    v_mov_b32_e32 v13, s13
 ; MOVREL-NEXT:    v_mov_b32_e32 v14, s14
 ; MOVREL-NEXT:    v_mov_b32_e32 v15, s15
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
 ; MOVREL-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
 ; MOVREL-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
 ; MOVREL-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
-; MOVREL-NEXT:    ; implicit-def: $vcc_hi
 ; MOVREL-NEXT:    global_store_dwordx4 v[0:1], v[12:15], off
 ; MOVREL-NEXT:    s_endpgm
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
index d0b43b93d431..b1e389053da2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
@@ -374,10 +374,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v1
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v4
-; CI-NEXT:    v_mov_b32_e32 v5, s1
 ; CI-NEXT:    v_mov_b32_e32 v4, 42
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; CI-NEXT:    flat_atomic_dec v0, v[0:1], v4 glc
+; CI-NEXT:    v_mov_b32_e32 v5, s1
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    flat_store_dword v[2:3], v0
 ; CI-NEXT:    s_endpgm
@@ -399,10 +399,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v1
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
-; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, 42
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; VI-NEXT:    flat_atomic_dec v0, v[0:1], v4 glc
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
@@ -695,10 +695,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v1
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v4
-; CI-NEXT:    v_mov_b32_e32 v5, s1
 ; CI-NEXT:    v_mov_b32_e32 v4, 42
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; CI-NEXT:    flat_atomic_dec v0, v[0:1], v4 glc
+; CI-NEXT:    v_mov_b32_e32 v5, s1
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    flat_store_dword v[2:3], v0
 ; CI-NEXT:    s_endpgm
@@ -720,10 +720,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v1
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
-; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, 42
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; VI-NEXT:    flat_atomic_dec v0, v[0:1], v4 glc
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
index 84e4c129325e..454034b0a428 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -385,10 +385,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v1
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v4
-; CI-NEXT:    v_mov_b32_e32 v5, s1
 ; CI-NEXT:    v_mov_b32_e32 v4, 42
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; CI-NEXT:    flat_atomic_inc v0, v[0:1], v4 glc
+; CI-NEXT:    v_mov_b32_e32 v5, s1
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    flat_store_dword v[2:3], v0
 ; CI-NEXT:    s_endpgm
@@ -410,10 +410,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v1
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
-; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, 42
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; VI-NEXT:    flat_atomic_inc v0, v[0:1], v4 glc
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
@@ -1304,10 +1304,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v1
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v4
-; CI-NEXT:    v_mov_b32_e32 v5, s1
 ; CI-NEXT:    v_mov_b32_e32 v4, 42
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; CI-NEXT:    flat_atomic_inc v0, v[0:1], v4 glc
+; CI-NEXT:    v_mov_b32_e32 v5, s1
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    flat_store_dword v[2:3], v0
 ; CI-NEXT:    s_endpgm
@@ -1329,10 +1329,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v1
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
-; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, 42
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; VI-NEXT:    flat_atomic_inc v0, v[0:1], v4 glc
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
@@ -1861,8 +1861,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0,
 ; CI-NEXT:    ds_inc_rtn_u32 v4, v3, v2
 ; CI-NEXT:    ds_inc_rtn_u32 v5, v3, v2
 ; CI-NEXT:    v_mov_b32_e32 v2, s4
-; CI-NEXT:    v_mov_b32_e32 v1, s7
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
+; CI-NEXT:    v_mov_b32_e32 v1, s7
 ; CI-NEXT:    s_waitcnt lgkmcnt(1)
 ; CI-NEXT:    flat_store_dword v[2:3], v4
 ; CI-NEXT:    s_waitcnt lgkmcnt(1)
@@ -1881,8 +1881,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0,
 ; VI-NEXT:    ds_inc_rtn_u32 v4, v3, v2
 ; VI-NEXT:    ds_inc_rtn_u32 v5, v3, v2
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    s_waitcnt lgkmcnt(1)
 ; VI-NEXT:    flat_store_dword v[2:3], v4
 ; VI-NEXT:    s_waitcnt lgkmcnt(1)

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 84d39acb17fe..7ba2d4f59f86 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -275,8 +275,8 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ;
 ; GFX1064-LABEL: add_i32_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
+; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX1064-NEXT:    s_load_dword s0, s[0:1], 0x2c
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
@@ -1832,8 +1832,8 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
 ;
 ; GFX1064-LABEL: sub_i32_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
+; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX1064-NEXT:    s_load_dword s0, s[0:1], 0x2c
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
index 42192289aedd..e123d80fb956 100644
--- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -263,13 +263,15 @@ entry:
 
 
 ; GCN-NOT: s_add_u32 s32, s32, 0x800
-; GCN-DAG: s_add_u32 s32, s33, 0xc00{{$}}
 
 ; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8
 ; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12
 ; GCN: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16
 ; GCN: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20
 
+; GCN-NOT: s_add_u32 s32, s32, 0x800
+; GCN-DAG: s_add_u32 s32, s33, 0xc00{{$}}
+
 ; GCN: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
 ; GCN: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
 ; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4

diff  --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index a627bf004658..faf15d0981c7 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -668,8 +668,6 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
 }
 
 ; GCN-LABEL: {{^}}test_call_external_void_func_byval_struct_i8_i32:
-; GCN-DAG: s_add_u32 [[SP:s[0-9]+]], s33, 0x400{{$}}
-
 ; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3
 ; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8
 ; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], s33 offset:8
@@ -678,18 +676,17 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
 ; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], s33 offset:8
 ; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], s33 offset:12
 
-; GCN-NOT: s_add_u32 [[SP]],
-
 ; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8
 ; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12
 
-; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}}
-; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4
-
-
 ; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8
 ; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12
 
+; GCN-DAG: s_add_u32 [[SP:s[0-9]+]], s33, 0x400{{$}}
+
+; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}}
+; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4
+
 ; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]]{{$}}
 ; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:4
 

diff  --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 61528cf504f6..e47390b48751 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -400,7 +400,7 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
 ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
 ; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s34{{$}}
+; GCN-DAG: buffer_load_dword v32, off, s[0:3], s34{{$}}
 
 ; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
 

diff  --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index 3529db39dff2..77f03d2222dd 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -114,10 +114,10 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x
 ; SI-NEXT:    s_mov_b32 s9, s5
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
-; SI-NEXT:    s_mov_b32 s6, s10
-; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    s_mov_b32 s2, s10
 ; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_mov_b32 s6, s10
+; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -138,11 +138,11 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_mov_b32 s12, s2
 ; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    s_mov_b32 s2, s10
+; VI-NEXT:    s_mov_b32 s3, s11
 ; VI-NEXT:    s_mov_b32 s8, s4
 ; VI-NEXT:    s_mov_b32 s9, s5
 ; VI-NEXT:    s_mov_b32 s15, s11
-; VI-NEXT:    s_mov_b32 s2, s10
-; VI-NEXT:    s_mov_b32 s3, s11
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
@@ -173,14 +173,14 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
 ; SI-NEXT:    s_mov_b32 s18, s14
 ; SI-NEXT:    s_mov_b32 s19, s15
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s12, s6
-; SI-NEXT:    s_mov_b32 s13, s7
 ; SI-NEXT:    s_mov_b32 s16, s2
 ; SI-NEXT:    s_mov_b32 s17, s3
-; SI-NEXT:    s_mov_b32 s6, s14
-; SI-NEXT:    s_mov_b32 s7, s15
 ; SI-NEXT:    s_mov_b32 s2, s14
 ; SI-NEXT:    s_mov_b32 s3, s15
+; SI-NEXT:    s_mov_b32 s12, s6
+; SI-NEXT:    s_mov_b32 s13, s7
+; SI-NEXT:    s_mov_b32 s6, s14
+; SI-NEXT:    s_mov_b32 s7, s15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[16:19], 0
@@ -200,16 +200,16 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s8, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s8, s6
-; VI-NEXT:    s_mov_b32 s9, s7
 ; VI-NEXT:    s_mov_b32 s12, s2
 ; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    s_mov_b32 s2, s10
+; VI-NEXT:    s_mov_b32 s3, s11
+; VI-NEXT:    s_mov_b32 s8, s6
+; VI-NEXT:    s_mov_b32 s9, s7
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_mov_b32 s15, s11
-; VI-NEXT:    s_mov_b32 s2, s10
-; VI-NEXT:    s_mov_b32 s3, s11
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index a864cac13940..6c402306067c 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -274,14 +274,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_movk_i32 s12, 0xff
 ; SI-NEXT:    s_mov_b32 s10, s2
 ; SI-NEXT:    s_mov_b32 s11, s3
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 9, v1
@@ -293,7 +293,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v6
 ; SI-NEXT:    v_and_b32_e32 v7, s12, v7
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 9, v4
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v6, v7
@@ -315,6 +314,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
 ; VI-NEXT:    v_mov_b32_e32 v4, 9
+; VI-NEXT:    s_movk_i32 s8, 0x900
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -324,7 +324,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_mov_b32 s6, s2
 ; VI-NEXT:    s_mov_b32 s7, s3
-; VI-NEXT:    s_movk_i32 s8, 0x900
 ; VI-NEXT:    v_mov_b32_e32 v6, s8
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v7, 24, v5
@@ -384,11 +383,11 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    v_or_b32_e32 v3, v3, v6
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:24
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_or_b32_e32 v3, v3, v6
 ; SI-NEXT:    v_cvt_f32_ubyte1_e32 v5, v3
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v3
 ; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
@@ -414,6 +413,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 3, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_ubyte v8, v[0:1]
 ; VI-NEXT:    flat_load_ubyte v9, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v10, v[4:5]
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
@@ -422,16 +422,15 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, 4, v0
 ; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v8, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v4, v[6:7]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v9
 ; VI-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v9
+; VI-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v10
 ; VI-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
index 2a405352f253..7ea1dae20d3f 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
@@ -63,7 +63,7 @@ define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)*
 ; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
 
 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
-; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]]
+; GCN-DAG: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]]
 ; GCN: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
 ; GCN: s_endpgm
 define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
@@ -91,7 +91,7 @@ define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrsp
 ; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8
 
 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}}
-; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]]
+; GCN-DAG: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]]
 ; GCN: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
 ; GCN: s_endpgm
 define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {

diff  --git a/llvm/test/CodeGen/AMDGPU/global-saddr.ll b/llvm/test/CodeGen/AMDGPU/global-saddr.ll
index 4df1ad683df6..82f03472e1b7 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr.ll
@@ -4,8 +4,8 @@
 
 ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
 ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}}
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}}
 ; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:8{{$}}
 

diff  --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 1908015f4770..d54058eec30c 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -312,7 +312,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
 
 ; GCN: flat_store_dwordx4
 
@@ -326,6 +325,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
 
 ; VI: v_cvt_f32_f16_e32
 ; VI: v_cvt_f32_f16_sdwa

diff  --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index 1c97e2a2e2f8..b917e86821a5 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -2552,15 +2552,15 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX7-NEXT:    s_and_b32 s4, s4, s8
 ; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_and_b32 s5, s5, s8
+; GFX7-NEXT:    s_and_b32 s4, s4, s8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
@@ -2674,16 +2674,16 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX7-LABEL: notsdot2_sext8:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GFX7-NEXT:    s_mov_b32 s10, s2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s8, s6
 ; GFX7-NEXT:    s_mov_b32 s9, s7
-; GFX7-NEXT:    s_mov_b32 s11, s3
 ; GFX7-NEXT:    s_mov_b32 s6, s2
 ; GFX7-NEXT:    s_mov_b32 s7, s3
+; GFX7-NEXT:    s_mov_b32 s11, s3
 ; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
 ; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
@@ -2704,11 +2704,11 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(0)
@@ -2731,11 +2731,11 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NODL-NEXT:    global_load_ushort v2, v[2:3], off
 ; GFX9-NODL-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
@@ -2759,11 +2759,11 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[2:3], off
 ; GFX9-DL-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
@@ -2788,13 +2788,13 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[2:3], off
 ; GFX10-DL-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, v2
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 28eb2717bf44..6f9ba28e4ac5 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -183,16 +183,16 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_sext_i32_i8 s6, s4
-; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x80008
 ; GFX7-NEXT:    s_sext_i32_i8 s7, s5
 ; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x80008
 ; GFX7-NEXT:    s_and_b32 s7, s7, s8
 ; GFX7-NEXT:    s_bfe_i32 s12, s5, 0x80010
+; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x80008
 ; GFX7-NEXT:    s_and_b32 s10, s10, s8
 ; GFX7-NEXT:    s_and_b32 s6, s6, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s7
@@ -357,19 +357,19 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_movk_i32 s5, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
 ; GFX7-NEXT:    s_and_b32 s7, s6, s5
-; GFX7-NEXT:    s_and_b32 s5, s4, s5
 ; GFX7-NEXT:    s_bfe_u32 s8, s6, 0x80008
+; GFX7-NEXT:    s_and_b32 s5, s4, s5
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_bfe_u32 s10, s6, 0x80010
+; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
 ; GFX7-NEXT:    s_lshr_b32 s6, s6, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
@@ -899,16 +899,16 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_ashr_i32 s6, s4, 24
-; GFX7-NEXT:    s_bfe_i32 s7, s4, 0x80010
 ; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x80010
 ; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x80008
 ; GFX7-NEXT:    s_ashr_i32 s9, s5, 24
 ; GFX7-NEXT:    s_sext_i32_i8 s5, s5
+; GFX7-NEXT:    s_bfe_i32 s7, s4, 0x80010
 ; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x80008
 ; GFX7-NEXT:    s_sext_i32_i8 s4, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
@@ -1048,9 +1048,9 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ushort v3, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ushort v3, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_bfe_i32 s4, s0, 0x80000
 ; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x80000

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 6172d54b8e0d..33cbe55f32e4 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -184,19 +184,19 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_movk_i32 s5, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
 ; GFX7-NEXT:    s_and_b32 s7, s6, s5
-; GFX7-NEXT:    s_and_b32 s5, s4, s5
 ; GFX7-NEXT:    s_bfe_u32 s8, s6, 0x80008
+; GFX7-NEXT:    s_and_b32 s5, s4, s5
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_bfe_u32 s10, s6, 0x80010
+; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
 ; GFX7-NEXT:    s_lshr_b32 s6, s6, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
@@ -354,19 +354,19 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_movk_i32 s5, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
 ; GFX7-NEXT:    s_and_b32 s7, s6, s5
-; GFX7-NEXT:    s_and_b32 s5, s4, s5
 ; GFX7-NEXT:    s_bfe_u32 s8, s6, 0x80008
+; GFX7-NEXT:    s_and_b32 s5, s4, s5
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_bfe_u32 s10, s6, 0x80010
+; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
 ; GFX7-NEXT:    s_lshr_b32 s6, s6, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
@@ -518,15 +518,15 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_and_b32 s7, s4, s8
-; GFX7-NEXT:    s_bfe_u32 s4, s4, 0x80008
 ; GFX7-NEXT:    s_and_b32 s6, s5, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7-NEXT:    s_bfe_u32 s5, s5, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s4, s4, 0x80008
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
@@ -615,9 +615,9 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s3, s0, s2
 ; GFX10-DL-NEXT:    s_and_b32 s2, s1, s2
@@ -658,14 +658,14 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_and_b32 s6, s4, s8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7-NEXT:    s_and_b32 s7, s5, s8
 ; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
 ; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s8
@@ -821,16 +821,16 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_and_b32 s6, s4, s8
-; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
 ; GFX7-NEXT:    s_and_b32 s7, s5, s8
 ; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
 ; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
 ; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
@@ -950,9 +950,9 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x80008
@@ -1548,9 +1548,9 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80008
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80008
@@ -1940,9 +1940,9 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ushort v3, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ushort v3, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, s0
 ; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -2002,17 +2002,17 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bfe_u32 s6, s4, 0x80008
-; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
 ; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s11, s5, 16
 ; GFX7-NEXT:    s_lshr_b32 s12, s5, 24
-; GFX7-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
+; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX7-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX7-NEXT:    s_lshr_b32 s9, s4, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s12
 ; GFX7-NEXT:    s_mul_i32 s4, s4, s5
@@ -2160,9 +2160,9 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, s0
 ; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, s1

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 9ce54565b109..e5f6023618e1 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -260,15 +260,15 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s1, s[8:9], 0x0
 ; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s1, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s2, s[10:11], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bfe_i32 s8, s1, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x40004
 ; GFX7-NEXT:    s_bfe_i32 s9, s2, 0x40000
 ; GFX7-NEXT:    s_bfe_i32 s11, s2, 0x40004
 ; GFX7-NEXT:    s_and_b32 s9, s9, s0
+; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x40004
 ; GFX7-NEXT:    s_bfe_i32 s13, s2, 0x40008
 ; GFX7-NEXT:    s_and_b32 s11, s11, s0
 ; GFX7-NEXT:    s_and_b32 s8, s8, s0
@@ -478,9 +478,9 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_lshr_b32 s4, s0, 12
 ; GFX10-DL-NEXT:    s_lshr_b32 s5, s1, 12
@@ -594,15 +594,15 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_movk_i32 s0, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s1, s[8:9], 0x0
 ; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s1, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s2, s[10:11], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bfe_i32 s8, s1, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x40004
 ; GFX7-NEXT:    s_bfe_i32 s9, s2, 0x40000
 ; GFX7-NEXT:    s_bfe_i32 s11, s2, 0x40004
 ; GFX7-NEXT:    s_and_b32 s9, s9, s0
+; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x40004
 ; GFX7-NEXT:    s_bfe_i32 s13, s2, 0x40008
 ; GFX7-NEXT:    s_and_b32 s11, s11, s0
 ; GFX7-NEXT:    s_and_b32 s8, s8, s0
@@ -821,9 +821,9 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_lshr_b32 s4, s0, 12
 ; GFX10-DL-NEXT:    s_lshr_b32 s5, s1, 12
@@ -1844,9 +1844,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s5, s0, 15
 ; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40004
@@ -1945,14 +1945,13 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s2, s[8:9], 0x0
 ; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s2, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s8, s[10:11], 0x0
-; GFX7-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bfe_i32 s9, s2, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s10, s2, 0x40004
 ; GFX7-NEXT:    s_bfe_i32 s16, s8, 0x40000
 ; GFX7-NEXT:    s_bfe_i32 s17, s8, 0x40004
 ; GFX7-NEXT:    s_bfe_i32 s18, s8, 0x40008
@@ -1961,8 +1960,9 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_bfe_i32 s21, s8, 0x40014
 ; GFX7-NEXT:    s_bfe_i32 s22, s8, 0x40018
 ; GFX7-NEXT:    s_ashr_i32 s8, s8, 28
-; GFX7-NEXT:    v_mov_b32_e32 v7, s17
 ; GFX7-NEXT:    v_mov_b32_e32 v8, s16
+; GFX7-NEXT:    s_bfe_i32 s10, s2, 0x40004
+; GFX7-NEXT:    v_mov_b32_e32 v7, s17
 ; GFX7-NEXT:    s_bfe_i32 s11, s2, 0x40008
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s18
 ; GFX7-NEXT:    s_bfe_i32 s12, s2, 0x4000c
@@ -2292,9 +2292,9 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_lshr_b32 s8, s0, 4
 ; GFX10-DL-NEXT:    s_lshr_b32 s15, s1, 4

diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 79f827cc230c..ae4086adb826 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -14,11 +14,10 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s10, s[10:11], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s21, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s1, s0, 28
 ; GFX7-NEXT:    s_lshr_b32 s11, s10, 28
 ; GFX7-NEXT:    s_bfe_u32 s15, s10, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s16, s10, 0x40014
@@ -27,6 +26,7 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_bfe_u32 s19, s10, 0x40008
 ; GFX7-NEXT:    s_bfe_u32 s20, s10, 0x40004
 ; GFX7-NEXT:    s_and_b32 s10, s10, 15
+; GFX7-NEXT:    s_lshr_b32 s1, s0, 28
 ; GFX7-NEXT:    s_bfe_u32 s2, s0, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
 ; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
@@ -59,11 +59,10 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s4, s2, 28
 ; GFX8-NEXT:    s_lshr_b32 s7, s6, 28
 ; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40018
 ; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40014
@@ -72,6 +71,7 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40008
 ; GFX8-NEXT:    s_bfe_u32 s18, s6, 0x40004
 ; GFX8-NEXT:    s_and_b32 s6, s6, 15
+; GFX8-NEXT:    s_lshr_b32 s4, s2, 28
 ; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40018
 ; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
 ; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40010
@@ -106,11 +106,10 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
 ; GFX9-NEXT:    s_lshr_b32 s7, s6, 28
 ; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40018
 ; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40014
@@ -119,6 +118,7 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40008
 ; GFX9-NEXT:    s_bfe_u32 s18, s6, 0x40004
 ; GFX9-NEXT:    s_and_b32 s6, s6, 15
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
 ; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
 ; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
 ; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40010
@@ -259,12 +259,11 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40014
 ; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40010
@@ -273,6 +272,7 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40004
 ; GFX7-NEXT:    s_lshr_b32 s14, s1, 28
 ; GFX7-NEXT:    s_and_b32 s1, s1, 15
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40014
 ; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40010
 ; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x4000c
@@ -451,9 +451,9 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
 ; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
@@ -561,12 +561,11 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40014
 ; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40010
@@ -575,6 +574,7 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40004
 ; GFX7-NEXT:    s_lshr_b32 s14, s1, 28
 ; GFX7-NEXT:    s_and_b32 s1, s1, 15
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40014
 ; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40010
 ; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x4000c
@@ -753,9 +753,9 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
 ; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
@@ -863,12 +863,11 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40014
 ; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40010
@@ -877,6 +876,7 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40004
 ; GFX7-NEXT:    s_lshr_b32 s14, s1, 28
 ; GFX7-NEXT:    s_and_b32 s1, s1, 15
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40014
 ; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40010
 ; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x4000c
@@ -1065,9 +1065,9 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
 ; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
@@ -1162,12 +1162,11 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40014
 ; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40010
@@ -1176,6 +1175,7 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40004
 ; GFX7-NEXT:    s_lshr_b32 s14, s1, 28
 ; GFX7-NEXT:    s_and_b32 s1, s1, 15
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40014
 ; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40010
 ; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x4000c
@@ -1364,9 +1364,9 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
 ; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
@@ -1459,11 +1459,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s10, s[10:11], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s21, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s1, s0, 28
 ; GFX7-NEXT:    s_bfe_u32 s20, s10, 0x40004
 ; GFX7-NEXT:    s_lshr_b32 s11, s10, 28
 ; GFX7-NEXT:    s_bfe_u32 s15, s10, 0x40018
@@ -1472,6 +1471,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_bfe_u32 s18, s10, 0x4000c
 ; GFX7-NEXT:    s_bfe_u32 s19, s10, 0x40008
 ; GFX7-NEXT:    s_and_b32 s10, s10, 15
+; GFX7-NEXT:    s_lshr_b32 s1, s0, 28
 ; GFX7-NEXT:    s_bfe_u32 s2, s0, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
 ; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
@@ -1506,11 +1506,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s4, s2, 28
 ; GFX8-NEXT:    s_bfe_u32 s18, s6, 0x40004
 ; GFX8-NEXT:    s_lshr_b32 s7, s6, 28
 ; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40018
@@ -1519,6 +1518,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x4000c
 ; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40008
 ; GFX8-NEXT:    s_and_b32 s6, s6, 15
+; GFX8-NEXT:    s_lshr_b32 s4, s2, 28
 ; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40018
 ; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
 ; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40010
@@ -1555,11 +1555,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
 ; GFX9-NEXT:    s_bfe_u32 s18, s6, 0x40004
 ; GFX9-NEXT:    s_lshr_b32 s7, s6, 28
 ; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40018
@@ -1568,6 +1567,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x4000c
 ; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40008
 ; GFX9-NEXT:    s_and_b32 s6, s6, 15
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
 ; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
 ; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
 ; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40010
@@ -1604,11 +1604,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
 ; GFX9-DL-NEXT:    s_bfe_u32 s18, s6, 0x40004
 ; GFX9-DL-NEXT:    s_lshr_b32 s7, s6, 28
 ; GFX9-DL-NEXT:    s_bfe_u32 s13, s6, 0x40018
@@ -1617,6 +1616,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x4000c
 ; GFX9-DL-NEXT:    s_bfe_u32 s17, s6, 0x40008
 ; GFX9-DL-NEXT:    s_and_b32 s6, s6, 15
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
 ; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
 ; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
 ; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40010
@@ -1767,11 +1767,10 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s10, s[10:11], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s21, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s1, s0, 28
 ; GFX7-NEXT:    s_lshr_b32 s11, s10, 28
 ; GFX7-NEXT:    s_bfe_u32 s15, s10, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s16, s10, 0x40014
@@ -1780,6 +1779,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_bfe_u32 s19, s10, 0x40008
 ; GFX7-NEXT:    s_bfe_u32 s20, s10, 0x40004
 ; GFX7-NEXT:    s_and_b32 s10, s10, 15
+; GFX7-NEXT:    s_lshr_b32 s1, s0, 28
 ; GFX7-NEXT:    s_bfe_u32 s2, s0, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
 ; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
@@ -1812,11 +1812,10 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s4, s2, 28
 ; GFX8-NEXT:    s_lshr_b32 s7, s6, 28
 ; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40018
 ; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40014
@@ -1825,6 +1824,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40008
 ; GFX8-NEXT:    s_bfe_u32 s18, s6, 0x40004
 ; GFX8-NEXT:    s_and_b32 s6, s6, 15
+; GFX8-NEXT:    s_lshr_b32 s4, s2, 28
 ; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40018
 ; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
 ; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40010
@@ -1859,11 +1859,10 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
 ; GFX9-NEXT:    s_lshr_b32 s7, s6, 28
 ; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40018
 ; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40014
@@ -1872,6 +1871,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40008
 ; GFX9-NEXT:    s_bfe_u32 s18, s6, 0x40004
 ; GFX9-NEXT:    s_and_b32 s6, s6, 15
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
 ; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
 ; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
 ; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40010
@@ -1977,15 +1977,13 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x40004
-; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x4000c
 ; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x40004
 ; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x4000c
-; GFX7-NEXT:    v_mov_b32_e32 v2, s20
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s18
 ; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40014
@@ -1993,6 +1991,8 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_and_b32 s19, s1, 15
 ; GFX7-NEXT:    s_lshr_b32 s14, s1, 28
 ; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x4000c
+; GFX7-NEXT:    v_mov_b32_e32 v2, s20
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v2, s13, v2
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v4, s11, v4
 ; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
@@ -2193,9 +2193,9 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
 ; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40004
@@ -2278,21 +2278,21 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bfe_u32 s2, s0, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40004
 ; GFX7-NEXT:    s_bfe_u32 s14, s1, 0x4000c
 ; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40004
 ; GFX7-NEXT:    s_lshr_b32 s18, s1, 28
-; GFX7-NEXT:    v_mov_b32_e32 v6, s16
 ; GFX7-NEXT:    v_mov_b32_e32 v8, s14
 ; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40008
 ; GFX7-NEXT:    s_and_b32 s17, s1, 15
 ; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX7-NEXT:    v_mov_b32_e32 v6, s16
 ; GFX7-NEXT:    s_lshr_b32 s11, s0, 28
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s18
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v4, s11, v4
@@ -2555,9 +2555,9 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40004
 ; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40004
@@ -2650,12 +2650,11 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40014
 ; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40010
@@ -2664,6 +2663,7 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40004
 ; GFX7-NEXT:    s_lshr_b32 s14, s1, 28
 ; GFX7-NEXT:    s_and_b32 s1, s1, 15
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
 ; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40014
 ; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40010
 ; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x4000c
@@ -2852,9 +2852,9 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
 ; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15

diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 04b8be9abf2c..7f981412c573 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -16,9 +16,9 @@ define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out,
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, 0x40a00000
-; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v3, s7
@@ -31,9 +31,9 @@ define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out,
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0x40a00000
-; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
@@ -156,9 +156,9 @@ define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_movk_i32 s4, 0x3e7
-; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v3, s7
@@ -171,9 +171,9 @@ define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_movk_i32 s4, 0x3e7
-; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
@@ -505,9 +505,9 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
 ; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
+; SI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
@@ -538,9 +538,9 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
+; VI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
 ; VI-NEXT:    v_mov_b32_e32 v1, s9
@@ -690,8 +690,8 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s11
-; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 3
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 3
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s10
 ; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 2
@@ -1628,6 +1628,10 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
 ; SI-NEXT:    s_and_b32 s4, s4, 7
 ; SI-NEXT:    s_lshl_b32 s4, s4, 3
 ; SI-NEXT:    v_mov_b32_e32 v1, s13
+; SI-NEXT:    v_mov_b32_e32 v12, s24
+; SI-NEXT:    v_mov_b32_e32 v13, s25
+; SI-NEXT:    v_mov_b32_e32 v14, s26
+; SI-NEXT:    v_mov_b32_e32 v15, s27
 ; SI-NEXT:    v_mov_b32_e32 v2, s14
 ; SI-NEXT:    v_mov_b32_e32 v3, s15
 ; SI-NEXT:    v_mov_b32_e32 v4, s16
@@ -1638,10 +1642,6 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
 ; SI-NEXT:    v_mov_b32_e32 v9, s21
 ; SI-NEXT:    v_mov_b32_e32 v10, s22
 ; SI-NEXT:    v_mov_b32_e32 v11, s23
-; SI-NEXT:    v_mov_b32_e32 v12, s24
-; SI-NEXT:    v_mov_b32_e32 v13, s25
-; SI-NEXT:    v_mov_b32_e32 v14, s26
-; SI-NEXT:    v_mov_b32_e32 v15, s27
 ; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
 ; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
 ; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
@@ -1674,6 +1674,10 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
 ; VI-NEXT:    s_and_b32 s4, s4, 7
 ; VI-NEXT:    s_lshl_b32 s4, s4, 3
 ; VI-NEXT:    v_mov_b32_e32 v1, s13
+; VI-NEXT:    v_mov_b32_e32 v12, s24
+; VI-NEXT:    v_mov_b32_e32 v13, s25
+; VI-NEXT:    v_mov_b32_e32 v14, s26
+; VI-NEXT:    v_mov_b32_e32 v15, s27
 ; VI-NEXT:    v_mov_b32_e32 v2, s14
 ; VI-NEXT:    v_mov_b32_e32 v3, s15
 ; VI-NEXT:    v_mov_b32_e32 v4, s16
@@ -1684,10 +1688,6 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v9, s21
 ; VI-NEXT:    v_mov_b32_e32 v10, s22
 ; VI-NEXT:    v_mov_b32_e32 v11, s23
-; VI-NEXT:    v_mov_b32_e32 v12, s24
-; VI-NEXT:    v_mov_b32_e32 v13, s25
-; VI-NEXT:    v_mov_b32_e32 v14, s26
-; VI-NEXT:    v_mov_b32_e32 v15, s27
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80

diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index f31c1b5c87fb..627f68913866 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -579,8 +579,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff0000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -599,8 +599,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; VI-LABEL: v_insertelement_v2i16_0_reghi:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -619,8 +619,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; CI-LABEL: v_insertelement_v2i16_0_reghi:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1165,8 +1165,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
 ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3e703e7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
@@ -1186,8 +1186,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
 ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x3e703e7
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1207,8 +1207,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
 ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v3, 0x3e703e7
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1330,8 +1330,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
 ; GFX9-LABEL: v_insertelement_v4f16_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -1349,8 +1349,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
 ; VI-LABEL: v_insertelement_v4f16_0:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1369,8 +1369,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
 ; CI-LABEL: v_insertelement_v4f16_0:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1401,8 +1401,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out
 ; GFX9-LABEL: v_insertelement_v4f16_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
@@ -1420,8 +1420,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out
 ; VI-LABEL: v_insertelement_v4f16_1:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1440,8 +1440,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out
 ; CI-LABEL: v_insertelement_v4f16_1:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1472,8 +1472,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
 ; GFX9-LABEL: v_insertelement_v4f16_2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -1491,8 +1491,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
 ; VI-LABEL: v_insertelement_v4f16_2:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1511,8 +1511,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
 ; CI-LABEL: v_insertelement_v4f16_2:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1543,8 +1543,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out
 ; GFX9-LABEL: v_insertelement_v4f16_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
@@ -1562,8 +1562,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out
 ; VI-LABEL: v_insertelement_v4f16_3:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1582,8 +1582,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out
 ; CI-LABEL: v_insertelement_v4f16_3:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1614,8 +1614,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
 ; GFX9-LABEL: v_insertelement_v4i16_2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -1633,8 +1633,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
 ; VI-LABEL: v_insertelement_v4i16_2:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1653,8 +1653,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
 ; CI-LABEL: v_insertelement_v4i16_2:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1686,9 +1686,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
 ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
 ; GFX9-NEXT:    s_mov_b32 s5, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -1712,10 +1712,10 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
 ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
 ; VI-NEXT:    s_mov_b32 s4, 0xffff
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1723,16 +1723,15 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    s_mov_b32 s5, 0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s1, s6, s4
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    s_lshl_b32 s0, s1, 16
 ; VI-NEXT:    s_or_b32 s0, s1, s0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
 ; VI-NEXT:    v_lshlrev_b64 v[4:5], v4, s[4:5]
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_bfi_b32 v1, v5, s0, v1
 ; VI-NEXT:    v_bfi_b32 v0, v4, s0, v0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1741,27 +1740,26 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
 ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    s_load_dword s6, s[4:5], 0x4
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    flat_load_dword v4, v[0:1]
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_load_dword s6, s[4:5], 0x4
 ; CI-NEXT:    s_mov_b32 s4, 0xffff
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    s_mov_b32 s5, 0
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshl_b32 s2, s6, 16
 ; CI-NEXT:    s_and_b32 s3, s6, s4
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
 ; CI-NEXT:    s_or_b32 s1, s3, s2
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
 ; CI-NEXT:    v_lshl_b64 v[4:5], s[4:5], v4
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_bfi_b32 v1, v5, s1, v1
 ; CI-NEXT:    v_bfi_b32 v0, v4, s1, v0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1783,8 +1781,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
 ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0
 ; GFX9-NEXT:    s_mov_b32 s6, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1809,8 +1807,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
 ; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_mov_b32 s6, 0xffff
 ; VI-NEXT:    s_mov_b32 s7, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1837,8 +1835,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
 ; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_mov_b32 s6, 0xffff
 ; CI-NEXT:    s_mov_b32 s7, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 4f32340a0ff8..a53002197203 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -15,13 +15,13 @@ define amdgpu_kernel void @maxnum_f16(
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_mov_b32 s2, s10
-; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s12, s6
 ; SI-NEXT:    s_mov_b32 s13, s7
-; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_mov_b32 s2, s10
+; SI-NEXT:    s_mov_b32 s3, s11
 ; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
 ; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
 ; SI-NEXT:    s_mov_b32 s8, s4
@@ -49,9 +49,9 @@ define amdgpu_kernel void @maxnum_f16(
 ; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_mov_b32 s4, s6
 ; VI-NEXT:    s_mov_b32 s5, s7
-; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_mov_b32 s6, s2
 ; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
 ; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -74,9 +74,9 @@ define amdgpu_kernel void @maxnum_f16(
 ; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_mov_b32 s4, s6
 ; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_mov_b32 s6, s2
 ; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
@@ -460,9 +460,9 @@ define amdgpu_kernel void @maxnum_v3f16(
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
 ; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
-; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s1, s6, 16
 ; SI-NEXT:    s_lshr_b32 s4, s8, 16
@@ -478,9 +478,9 @@ define amdgpu_kernel void @maxnum_v3f16(
 ; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_max_f32_e32 v1, v1, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_max_f32_e32 v0, v0, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 923383cf5a1a..fcb483f18c9a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -15,13 +15,13 @@ define amdgpu_kernel void @minnum_f16_ieee(
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_mov_b32 s2, s10
-; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s12, s6
 ; SI-NEXT:    s_mov_b32 s13, s7
-; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_mov_b32 s2, s10
+; SI-NEXT:    s_mov_b32 s3, s11
 ; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
 ; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
 ; SI-NEXT:    s_mov_b32 s8, s4
@@ -49,9 +49,9 @@ define amdgpu_kernel void @minnum_f16_ieee(
 ; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_mov_b32 s4, s6
 ; VI-NEXT:    s_mov_b32 s5, s7
-; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_mov_b32 s6, s2
 ; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
 ; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -74,9 +74,9 @@ define amdgpu_kernel void @minnum_f16_ieee(
 ; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_mov_b32 s4, s6
 ; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_mov_b32 s6, s2
 ; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
@@ -513,9 +513,9 @@ define amdgpu_kernel void @minnum_v3f16(
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
 ; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
-; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s1, s6, 16
 ; SI-NEXT:    s_lshr_b32 s4, s8, 16
@@ -531,9 +531,9 @@ define amdgpu_kernel void @minnum_v3f16(
 ; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_min_f32_e32 v1, v1, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_min_f32_e32 v0, v0, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 8d04002d2770..3801da79f1c4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -427,9 +427,9 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 ; CI-NEXT:    v_mov_b32_e32 v13, s13
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
 ; CI-NEXT:    v_bfi_b32 v12, s2, v12, v13
-; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v12, vcc
 ; CI-NEXT:    v_mov_b32_e32 v4, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_add_f64 v[4:5], v[10:11], v[4:5]
 ; CI-NEXT:    v_add_f64 v[0:1], v[8:9], v[0:1]
 ; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16

diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index 08ab3faf4c82..22d68bad99ae 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -545,8 +545,8 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in,
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
-; GFX803-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX803-NEXT:    ds_read_u16 v0, v0
+; GFX803-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    ds_write_b16 v2, v0
@@ -594,10 +594,10 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in,
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
-; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX803-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX803-NEXT:    ds_read_u16 v0, v0
+; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX803-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX803-NEXT:    ds_write_b16 v3, v1
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -618,8 +618,8 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16 v0, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX900-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX900-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    ds_write_b16 v2, v0
 ; GFX900-NEXT:    ds_write_b16 v3, v5
@@ -632,8 +632,8 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX906-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    ds_write_b16 v2, v0
 ; GFX906-NEXT:    ds_write_b16 v3, v5

diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
index f6611ec26a82..ef646d6be267 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -42,10 +42,10 @@ entry:
 ; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
 ; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
 ; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
-; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
 
 ; GCN: s_barrier
 
+; SI:     v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7

diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 95e88ebfc307..a50a04a4cd06 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -145,8 +145,8 @@ define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
 ; GFX9-LABEL: lshr_v_s_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
@@ -163,8 +163,8 @@ define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
 ; VI-LABEL: lshr_v_s_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
@@ -220,8 +220,8 @@ define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
 ; GFX9-LABEL: lshr_s_v_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
@@ -238,8 +238,8 @@ define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
 ; VI-LABEL: lshr_s_v_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2

diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index b370e5b9acd7..178d111dd70a 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -169,14 +169,14 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_short_d16 v5, v[2:3], off offset:4
 ; GFX9-NEXT:    global_load_short_d16 v6, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_dword v7, v[0:1], off
+; GFX9-NEXT:    global_load_short_d16 v5, v[2:3], off offset:4
 ; GFX9-NEXT:    global_load_dword v2, v[2:3], off
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_pk_max_i16 v3, v6, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_i16 v2, v7, v2

diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
index 21c863da44f9..77b9fcb3915e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
@@ -885,7 +885,8 @@ entry:
 ; GFX89:         flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GFX10WGP:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GFX10CU-NOT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-; GFX89-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX89:         s_waitcnt lgkmcnt(0){{$}}
+; GFX89:         s_waitcnt vmcnt(0){{$}}
 ; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX89-NOT:     buffer_wbinvl1_vol
 ; GFX10WGP-NEXT: buffer_gl0_inv
@@ -912,7 +913,8 @@ entry:
 ; GFX89:         flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GFX10WGP:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GFX10CU:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GFX89-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX89:         s_waitcnt lgkmcnt(0){{$}}
+; GFX89:         s_waitcnt vmcnt(0){{$}}
 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX89-NOT:     buffer_wbinvl1_vol
 ; GFX10WGP-NEXT: buffer_gl0_inv

diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index c721de658129..51716847743c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -5,10 +5,12 @@ define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocaptu
 ; GCN-LABEL: vector_clause:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v17, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 4, v0
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
+; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v[16:17], s[2:3]
 ; GCN-NEXT:    global_load_dwordx4 v[4:7], v[16:17], s[2:3] offset:16
 ; GCN-NEXT:    global_load_dwordx4 v[8:11], v[16:17], s[2:3] offset:32
@@ -64,6 +66,7 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocaptu
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-NEXT:    v_mov_b32_e32 v8, s8
 ; GCN-NEXT:    v_mov_b32_e32 v13, s19
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
@@ -71,15 +74,14 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocaptu
 ; GCN-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-NEXT:    v_mov_b32_e32 v6, s6
 ; GCN-NEXT:    v_mov_b32_e32 v7, s7
-; GCN-NEXT:    v_mov_b32_e32 v8, s8
-; GCN-NEXT:    v_mov_b32_e32 v9, s9
-; GCN-NEXT:    v_mov_b32_e32 v10, s10
-; GCN-NEXT:    v_mov_b32_e32 v11, s11
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off
 ; GCN-NEXT:    global_store_dwordx4 v[12:13], v[4:7], off offset:16
 ; GCN-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NEXT:    v_mov_b32_e32 v9, s9
+; GCN-NEXT:    v_mov_b32_e32 v10, s10
+; GCN-NEXT:    v_mov_b32_e32 v11, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s13
 ; GCN-NEXT:    v_mov_b32_e32 v2, s14
 ; GCN-NEXT:    v_mov_b32_e32 v3, s15
@@ -113,7 +115,6 @@ define void @mubuf_clause(<4 x i32> addrspace(5)* noalias nocapture readonly %ar
 ; GCN-NEXT:    v_and_b32_e32 v2, 0x3ff, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
 ; GCN-NEXT:    v_add_u32_e32 v0, v0, v2
-; GCN-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    buffer_load_dword v3, v0, s[0:3], s33 offen
@@ -131,6 +132,7 @@ define void @mubuf_clause(<4 x i32> addrspace(5)* noalias nocapture readonly %ar
 ; GCN-NEXT:    buffer_load_dword v15, v0, s[0:3], s33 offen offset:48
 ; GCN-NEXT:    buffer_load_dword v16, v0, s[0:3], s33 offen offset:52
 ; GCN-NEXT:    buffer_load_dword v17, v0, s[0:3], s33 offen offset:56
+; GCN-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], s33 offen offset:60
@@ -198,10 +200,12 @@ define amdgpu_kernel void @vector_clause_indirect(i64 addrspace(1)* noalias noca
 ; GCN-LABEL: vector_clause_indirect:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    global_load_dwordx2 v[8:9], v[0:1], s[2:3]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)

diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
index c384eb5b425b..fea47db60d9e 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
@@ -7,9 +7,9 @@
 @tess_lds = external addrspace(3) global [8192 x i32]
 
 ; CHECK-LABEL: {{^}}main:
-; CHECK: ds_write_b32
-; CHECK: ds_write_b32
-; CHECK: v_mov_b32_e32 v1, v0
+; CHECK-DAG: ds_write_b32
+; CHECK-DAG: ds_write_b32
+; CHECK-DAG: v_mov_b32_e32 v1, v0
 ; CHECK: tbuffer_store_format_xyzw v[0:3],
 define amdgpu_vs void @main(i32 inreg %arg) {
 main_body:

diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
new file mode 100644
index 000000000000..94ed449088a6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
@@ -0,0 +1,114 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-post-ra-bundler %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name:            bundle_memops
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; GCN-LABEL: name: bundle_memops
+    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GCN: S_NOP 0
+    ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit undef $vgpr3_vgpr4, implicit $exec {
+    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec
+    ; GCN: }
+    ; GCN: S_NOP 0
+    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GCN: BUNDLE implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr5, implicit undef $vgpr0_vgpr1, implicit $exec, implicit undef $vgpr3_vgpr4 {
+    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 4, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr5 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GCN: }
+    ; GCN: BUNDLE implicit undef $vgpr3_vgpr4, implicit $vgpr1, implicit $exec, implicit $vgpr0 {
+    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec
+    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec
+    ; GCN: }
+    ; GCN: S_NOP 0
+    ; GCN: BUNDLE implicit undef $vgpr3_vgpr4, implicit $vgpr1, implicit $exec, implicit $vgpr0 {
+    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec
+    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec
+    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec
+    ; GCN: }
+    ; GCN: S_NOP 0
+    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GCN: S_NOP 0
+    ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec
+    ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0, implicit $exec, implicit $vgpr1 {
+    ; GCN:   $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
+    ; GCN:   $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec
+    ; GCN: }
+    ; GCN: BUNDLE implicit $vgpr0, implicit $vgpr2, implicit killed $m0, implicit $exec, implicit $vgpr3 {
+    ; GCN:   DS_WRITE_B32_gfx9 $vgpr0, $vgpr2, 0, 0, implicit killed $m0, implicit $exec
+    ; GCN:   DS_WRITE_B32_gfx9 $vgpr0, $vgpr3, 4, 0, implicit killed $m0, implicit $exec
+    ; GCN: }
+    ; GCN: S_NOP 0
+    ; GCN: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit undef $sgpr0_sgpr1, implicit undef $sgpr10 {
+    ; GCN:   $sgpr2 = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0, 0
+    ; GCN:   $sgpr3 = S_LOAD_DWORD_SGPR undef $sgpr0_sgpr1, undef $sgpr10, 0, 0
+    ; GCN: }
+    ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr2, implicit $exec, implicit $vgpr1 {
+    ; GCN:   $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: }
+    ; GCN: BUNDLE implicit $vgpr0, implicit $vgpr2_vgpr3, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+    ; GCN:   BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: }
+    ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
+    ; GCN:   $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: }
+    ; GCN: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
+    ; GCN:   IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: }
+    ; GCN: S_NOP 0
+    ; GCN: $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 464, 0, 0
+    ; GCN: $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 128, 0, 0
+    ; GCN: S_NOP 0
+    ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0, implicit $exec, implicit $vgpr1 {
+    ; GCN:   $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
+    ; GCN:   $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec
+    ; GCN: }
+    $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    S_NOP 0
+    $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec
+    S_NOP 0
+    $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 4, 0, 0, 0, implicit $exec
+    $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec
+    $vgpr5 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec
+    S_NOP 0
+    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec
+    S_NOP 0
+    $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    S_NOP 0
+    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec
+    $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
+    $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 $vgpr0, $vgpr2, 0, 0, implicit killed $m0, implicit $exec
+    DS_WRITE_B32_gfx9 $vgpr0, $vgpr3, 4, 0, implicit killed $m0, implicit $exec
+    S_NOP 0
+    $sgpr2 = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0, 0
+    $sgpr3 = S_LOAD_DWORD_SGPR undef $sgpr0_sgpr1, undef $sgpr10, 0, 0
+    $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3,  undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3,  undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
+    IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
+    S_NOP 0
+    $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 464, 0, 0
+    $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 128, 0, 0
+    S_NOP 0
+    $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
+    $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec
+...

diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index b3ceb43a50f2..9aebce434fae 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -86,6 +86,7 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)*  %buffer) {
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
@@ -94,7 +95,6 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)*  %buffer) {
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
 ;
@@ -454,14 +454,14 @@ define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ;
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index 219ea42a7112..4dc49963203d 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -191,9 +191,9 @@ define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v4, s6
-; VI-NEXT:    v_mov_b32_e32 v5, s7
 ; VI-NEXT:    v_mov_b32_e32 v6, s4
 ; VI-NEXT:    v_mov_b32_e32 v7, s5
+; VI-NEXT:    v_mov_b32_e32 v5, s7
 ; VI-NEXT:    flat_load_dword v6, v[6:7]
 ; VI-NEXT:    flat_load_dword v4, v[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -215,9 +215,9 @@ define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 ; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s5
+; GFX9-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX9-NEXT:    global_load_dword v6, v[6:7], off
 ; GFX9-NEXT:    global_load_dword v4, v[4:5], off
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0

diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
index b1852190fa8d..da04c107fbcf 100644
--- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -170,9 +170,8 @@ entry:
 ; CI.
 
 ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8:
-; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
-; GCN-NOHSA-NOT: v_add
-; CI-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
+; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
+; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
 ; CI-NOHSA-NOT: v_add
 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
 ; CI-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}

diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 670610f4b92e..a7718496ee9a 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -101,10 +101,10 @@ define amdgpu_gs float @gs_main(i32 %idx) {
 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen
 
 ; GFX9_10-NOT: s_mov_b32 s5
-; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
-; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
+; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
+; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
 
-; GCN: s_mov_b32 s2, s5
+; GCN-DAG: s_mov_b32 s2, s5
 define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) {
   %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
   %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
@@ -120,10 +120,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen
 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen
 
-; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
-; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
+; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
+; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
 
-; GCN: s_mov_b32 s2, s5
+; GCN-DAG: s_mov_b32 s2, s5
 define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) {
   %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
   %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx

diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 16125bbcd78b..f023f7a7c8b1 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -11,17 +11,17 @@ define amdgpu_kernel void @select_f16(
 ; SI-NEXT:    s_mov_b32 s14, -1
 ; SI-NEXT:    s_mov_b32 s22, s14
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s16, s10
-; SI-NEXT:    s_mov_b32 s17, s11
-; SI-NEXT:    s_mov_b32 s10, s14
-; SI-NEXT:    s_mov_b32 s11, s15
 ; SI-NEXT:    s_mov_b32 s20, s6
 ; SI-NEXT:    s_mov_b32 s21, s7
 ; SI-NEXT:    s_mov_b32 s23, s15
+; SI-NEXT:    s_mov_b32 s16, s10
+; SI-NEXT:    s_mov_b32 s17, s11
 ; SI-NEXT:    s_mov_b32 s2, s14
 ; SI-NEXT:    s_mov_b32 s3, s15
 ; SI-NEXT:    s_mov_b32 s18, s14
 ; SI-NEXT:    s_mov_b32 s19, s15
+; SI-NEXT:    s_mov_b32 s10, s14
+; SI-NEXT:    s_mov_b32 s11, s15
 ; SI-NEXT:    buffer_load_ushort v0, off, s[20:23], 0
 ; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
 ; SI-NEXT:    buffer_load_ushort v2, off, s[16:19], 0
@@ -52,17 +52,17 @@ define amdgpu_kernel void @select_f16(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_mov_b32 s16, s10
-; VI-NEXT:    s_mov_b32 s17, s11
 ; VI-NEXT:    s_mov_b32 s4, s6
 ; VI-NEXT:    s_mov_b32 s5, s7
-; VI-NEXT:    s_mov_b32 s10, s2
-; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_mov_b32 s6, s2
 ; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    s_mov_b32 s16, s10
+; VI-NEXT:    s_mov_b32 s17, s11
 ; VI-NEXT:    s_mov_b32 s15, s3
 ; VI-NEXT:    s_mov_b32 s18, s2
 ; VI-NEXT:    s_mov_b32 s19, s3
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
 ; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
 ; VI-NEXT:    buffer_load_ushort v2, off, s[16:19], 0
@@ -248,14 +248,14 @@ define amdgpu_kernel void @select_f16_imm_c(
 ; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s12, s6
-; SI-NEXT:    s_mov_b32 s13, s7
-; SI-NEXT:    s_mov_b32 s6, s10
-; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    s_mov_b32 s16, s2
 ; SI-NEXT:    s_mov_b32 s17, s3
+; SI-NEXT:    s_mov_b32 s12, s6
+; SI-NEXT:    s_mov_b32 s13, s7
 ; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_mov_b32 s6, s10
+; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0
 ; SI-NEXT:    buffer_load_ushort v2, off, s[12:15], 0
@@ -283,14 +283,14 @@ define amdgpu_kernel void @select_f16_imm_c(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s8, s0
 ; VI-NEXT:    s_mov_b32 s9, s1
-; VI-NEXT:    s_mov_b32 s12, s6
-; VI-NEXT:    s_mov_b32 s13, s7
 ; VI-NEXT:    s_mov_b32 s0, s2
 ; VI-NEXT:    s_mov_b32 s1, s3
-; VI-NEXT:    s_mov_b32 s6, s10
-; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    s_mov_b32 s2, s10
 ; VI-NEXT:    s_mov_b32 s3, s11
+; VI-NEXT:    s_mov_b32 s12, s6
+; VI-NEXT:    s_mov_b32 s13, s7
+; VI-NEXT:    s_mov_b32 s6, s10
+; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
 ; VI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0
 ; VI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0
@@ -324,14 +324,14 @@ define amdgpu_kernel void @select_f16_imm_d(
 ; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s12, s6
-; SI-NEXT:    s_mov_b32 s13, s7
-; SI-NEXT:    s_mov_b32 s6, s10
-; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    s_mov_b32 s16, s2
 ; SI-NEXT:    s_mov_b32 s17, s3
+; SI-NEXT:    s_mov_b32 s12, s6
+; SI-NEXT:    s_mov_b32 s13, s7
 ; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_mov_b32 s6, s10
+; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0
 ; SI-NEXT:    buffer_load_ushort v2, off, s[12:15], 0
@@ -359,14 +359,14 @@ define amdgpu_kernel void @select_f16_imm_d(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s8, s0
 ; VI-NEXT:    s_mov_b32 s9, s1
-; VI-NEXT:    s_mov_b32 s12, s6
-; VI-NEXT:    s_mov_b32 s13, s7
 ; VI-NEXT:    s_mov_b32 s0, s2
 ; VI-NEXT:    s_mov_b32 s1, s3
-; VI-NEXT:    s_mov_b32 s6, s10
-; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    s_mov_b32 s2, s10
 ; VI-NEXT:    s_mov_b32 s3, s11
+; VI-NEXT:    s_mov_b32 s12, s6
+; VI-NEXT:    s_mov_b32 s13, s7
+; VI-NEXT:    s_mov_b32 s6, s10
+; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
 ; VI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0
 ; VI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0
@@ -400,18 +400,18 @@ define amdgpu_kernel void @select_v2f16(
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_mov_b32 s22, s2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s16, s10
-; SI-NEXT:    s_mov_b32 s17, s11
-; SI-NEXT:    s_mov_b32 s10, s2
-; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    s_mov_b32 s20, s6
 ; SI-NEXT:    s_mov_b32 s21, s7
 ; SI-NEXT:    s_mov_b32 s23, s3
+; SI-NEXT:    s_mov_b32 s16, s10
+; SI-NEXT:    s_mov_b32 s17, s11
 ; SI-NEXT:    s_mov_b32 s14, s2
 ; SI-NEXT:    s_mov_b32 s15, s3
-; SI-NEXT:    buffer_load_dword v0, off, s[20:23], 0
 ; SI-NEXT:    s_mov_b32 s18, s2
 ; SI-NEXT:    s_mov_b32 s19, s3
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    buffer_load_dword v0, off, s[20:23], 0
 ; SI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
 ; SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
 ; SI-NEXT:    buffer_load_dword v3, off, s[16:19], 0
@@ -419,8 +419,6 @@ define amdgpu_kernel void @select_v2f16(
 ; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
@@ -428,7 +426,9 @@ define amdgpu_kernel void @select_v2f16(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -454,18 +454,18 @@ define amdgpu_kernel void @select_v2f16(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_mov_b32 s16, s10
-; VI-NEXT:    s_mov_b32 s17, s11
 ; VI-NEXT:    s_mov_b32 s4, s6
 ; VI-NEXT:    s_mov_b32 s5, s7
-; VI-NEXT:    s_mov_b32 s10, s2
-; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_mov_b32 s6, s2
 ; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    s_mov_b32 s16, s10
+; VI-NEXT:    s_mov_b32 s17, s11
 ; VI-NEXT:    s_mov_b32 s15, s3
-; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_mov_b32 s18, s2
 ; VI-NEXT:    s_mov_b32 s19, s3
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s11, s3
+; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; VI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
 ; VI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
 ; VI-NEXT:    buffer_load_dword v3, off, s[16:19], 0
@@ -506,15 +506,15 @@ define amdgpu_kernel void @select_v2f16_imm_a(
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_mov_b32 s18, s10
-; SI-NEXT:    s_mov_b32 s19, s11
+; SI-NEXT:    s_mov_b32 s14, s10
+; SI-NEXT:    s_mov_b32 s15, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s16, s2
-; SI-NEXT:    s_mov_b32 s17, s3
 ; SI-NEXT:    s_mov_b32 s12, s6
 ; SI-NEXT:    s_mov_b32 s13, s7
-; SI-NEXT:    s_mov_b32 s14, s10
-; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_mov_b32 s16, s2
+; SI-NEXT:    s_mov_b32 s17, s3
+; SI-NEXT:    s_mov_b32 s18, s10
+; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
@@ -556,12 +556,12 @@ define amdgpu_kernel void @select_v2f16_imm_a(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s8, s0
 ; VI-NEXT:    s_mov_b32 s9, s1
+; VI-NEXT:    s_mov_b32 s12, s6
+; VI-NEXT:    s_mov_b32 s13, s7
 ; VI-NEXT:    s_mov_b32 s0, s2
 ; VI-NEXT:    s_mov_b32 s1, s3
 ; VI-NEXT:    s_mov_b32 s2, s10
 ; VI-NEXT:    s_mov_b32 s3, s11
-; VI-NEXT:    s_mov_b32 s12, s6
-; VI-NEXT:    s_mov_b32 s13, s7
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
@@ -601,15 +601,15 @@ define amdgpu_kernel void @select_v2f16_imm_b(
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_mov_b32 s18, s10
-; SI-NEXT:    s_mov_b32 s19, s11
+; SI-NEXT:    s_mov_b32 s14, s10
+; SI-NEXT:    s_mov_b32 s15, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s16, s2
-; SI-NEXT:    s_mov_b32 s17, s3
 ; SI-NEXT:    s_mov_b32 s12, s6
 ; SI-NEXT:    s_mov_b32 s13, s7
-; SI-NEXT:    s_mov_b32 s14, s10
-; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_mov_b32 s16, s2
+; SI-NEXT:    s_mov_b32 s17, s3
+; SI-NEXT:    s_mov_b32 s18, s10
+; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
@@ -651,12 +651,12 @@ define amdgpu_kernel void @select_v2f16_imm_b(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s8, s0
 ; VI-NEXT:    s_mov_b32 s9, s1
+; VI-NEXT:    s_mov_b32 s12, s6
+; VI-NEXT:    s_mov_b32 s13, s7
 ; VI-NEXT:    s_mov_b32 s0, s2
 ; VI-NEXT:    s_mov_b32 s1, s3
 ; VI-NEXT:    s_mov_b32 s2, s10
 ; VI-NEXT:    s_mov_b32 s3, s11
-; VI-NEXT:    s_mov_b32 s12, s6
-; VI-NEXT:    s_mov_b32 s13, s7
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
@@ -705,25 +705,24 @@ define amdgpu_kernel void @select_v2f16_imm_c(
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    s_mov_b32 s16, s2
 ; SI-NEXT:    s_mov_b32 s17, s3
-; SI-NEXT:    buffer_load_dword v3, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s15, s11
 ; SI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
+; SI-NEXT:    buffer_load_dword v3, off, s[4:7], 0
 ; SI-NEXT:    v_mov_b32_e32 v2, 0x3f200000
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v5
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
@@ -755,16 +754,15 @@ define amdgpu_kernel void @select_v2f16_imm_c(
 ; VI-NEXT:    s_mov_b32 s2, s10
 ; VI-NEXT:    s_mov_b32 s3, s11
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; VI-NEXT:    buffer_load_dword v4, off, s[4:7], 0
 ; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT:    buffer_load_dword v4, off, s[4:7], 0
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x3800
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x3900
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v4
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v6, v5
@@ -802,25 +800,25 @@ define amdgpu_kernel void @select_v2f16_imm_d(
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    s_mov_b32 s16, s2
 ; SI-NEXT:    s_mov_b32 s17, s3
-; SI-NEXT:    buffer_load_dword v3, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s15, s11
 ; SI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
+; SI-NEXT:    buffer_load_dword v3, off, s[4:7], 0
 ; SI-NEXT:    v_mov_b32_e32 v2, 0x3f200000
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
@@ -852,16 +850,15 @@ define amdgpu_kernel void @select_v2f16_imm_d(
 ; VI-NEXT:    s_mov_b32 s2, s10
 ; VI-NEXT:    s_mov_b32 s3, s11
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; VI-NEXT:    buffer_load_dword v4, off, s[4:7], 0
 ; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT:    buffer_load_dword v4, off, s[4:7], 0
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x3800
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x3900
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v4
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5

diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 4adacbc126d8..ccf3f39a661d 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -163,9 +163,9 @@ define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)*
 ; GCN-LABEL: shl_i16_v_s:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
@@ -218,9 +218,9 @@ define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrs
 ; GCN-LABEL: shl_i16_v_compute_s:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5

diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 7968aa4c0264..deafd6f7fe10 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -146,8 +146,8 @@ define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; GFX9-LABEL: shl_v_s_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
@@ -164,8 +164,8 @@ define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; VI-LABEL: shl_v_s_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
@@ -221,8 +221,8 @@ define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; GFX9-LABEL: shl_s_v_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
@@ -239,8 +239,8 @@ define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; VI-LABEL: shl_s_v_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2

diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index e9a6ba989426..0cf12c9c992b 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -125,7 +125,7 @@ define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32
 ; GFX9-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x4
 ; GFX9-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xc
 
-; GCN: ds_write_b32
+; GCN-DAG: ds_write_b32
 ; CI: buffer_store_dword
 ; GFX9: global_store_dword
 define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
@@ -255,21 +255,19 @@ define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %o
 ; CI: v_mov_b32
 ; CI: v_mov_b32
 
-; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-
 ; CI: v_add_i32
 ; CI: v_add_i32
 
+; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
-
 ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}}
 ; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}}
 
+; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12
+; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44
 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}{{$}}
 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12
 ; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44
 
 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36
 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52

diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
index 70c1ad962344..ef9bf7619512 100644
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -556,9 +556,9 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 add
 ; VI-NEXT:    s_mov_b32 s7, s3
 ; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
 ; VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
 ; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; VI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
 ; VI-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    buffer_store_dword v3, off, s[0:3], 0

diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 184082dd3625..43d55475d805 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -190,10 +190,10 @@ define amdgpu_kernel void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i
 }
 
 ; GCN-LABEL: {{^}}s_min_max_v4i16:
-; GFX9: v_pk_max_i16
-; GFX9: v_pk_min_i16
-; GFX9: v_pk_max_i16
-; GFX9: v_pk_min_i16
+; GFX9-DAG: v_pk_max_i16
+; GFX9-DAG: v_pk_min_i16
+; GFX9-DAG: v_pk_max_i16
+; GFX9-DAG: v_pk_min_i16
 define amdgpu_kernel void @s_min_max_v4i16(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 {
   %cond0 = icmp sgt <4 x i16> %val0, %val1
   %sel0 = select <4 x i1> %cond0, <4 x i16> %val0, <4 x i16> %val1

diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index f5857a330fab..d03d09f2be56 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -215,8 +215,8 @@ define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 {
 ; CIVI-LABEL: local_store_i17:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIVI-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; CIVI-NEXT:    s_mov_b32 m0, -1
+; CIVI-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; CIVI-NEXT:    ds_write_b16 v0, v1
 ; CIVI-NEXT:    ds_write_b8 v0, v2 offset:2
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)

diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
index e503fbb51ad7..f928a320058b 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -352,10 +352,10 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}mac_v2f16_same_add:
-; SI:  v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SI:  v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SI:  v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SI:  v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG:  v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG:  v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG:  v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG:  v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 ; VI-DAG:  v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-DAG:  v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
index d61c1743114c..5f771bdbe6ff 100644
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -41,9 +41,9 @@ define amdgpu_kernel void @madak_f16(
 ; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_mov_b32 s4, s6
 ; VI-NEXT:    s_mov_b32 s5, s7
-; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_mov_b32 s6, s2
 ; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
 ; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -75,11 +75,11 @@ define amdgpu_kernel void @madak_f16_use_2(
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s16, s10
 ; SI-NEXT:    s_mov_b32 s17, s11
+; SI-NEXT:    s_mov_b32 s10, s14
+; SI-NEXT:    s_mov_b32 s11, s15
 ; SI-NEXT:    s_mov_b32 s3, s15
 ; SI-NEXT:    s_mov_b32 s18, s14
 ; SI-NEXT:    s_mov_b32 s19, s15
-; SI-NEXT:    s_mov_b32 s10, s14
-; SI-NEXT:    s_mov_b32 s11, s15
 ; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
 ; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
 ; SI-NEXT:    buffer_load_ushort v2, off, s[0:3], 0
@@ -112,11 +112,11 @@ define amdgpu_kernel void @madak_f16_use_2(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s16, s10
 ; VI-NEXT:    s_mov_b32 s17, s11
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_mov_b32 s15, s3
 ; VI-NEXT:    s_mov_b32 s18, s2
 ; VI-NEXT:    s_mov_b32 s19, s3
-; VI-NEXT:    s_mov_b32 s10, s2
-; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
 ; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
 ; VI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0


        

