[llvm] 86c944d - AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets

Tom Stellard via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 24 20:14:18 PST 2020


Author: Tom Stellard
Date: 2020-01-24T19:45:56-08:00
New Revision: 86c944d790728891801778b8d98c2c65a83f36a5

URL: https://github.com/llvm/llvm-project/commit/86c944d790728891801778b8d98c2c65a83f36a5
DIFF: https://github.com/llvm/llvm-project/commit/86c944d790728891801778b8d98c2c65a83f36a5.diff

LOG: AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets

Summary:
This improves merging of sequences like:

store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr

Prior to this patch, the basic block was scanned in program order to find instructions
to merge, and the above sequence would be transformed to:

store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr

With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block.  We now transform this sequence into:

store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16

Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check whether an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable, we can remove it from the list
and avoid the more expensive mergeability checks.
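
For illustration, here is a minimal standalone C++ sketch of that idea, not the
SILoadStoreOptimizer code itself: candidates sharing a base address are sorted by
offset and then merged greedily, so each candidate only needs to be compared
against its immediate neighbor in the sorted list. The Candidate struct, the
mergeRun helper, and the 16-byte (four-dword) width cap are assumptions made for
this example only.

#include <algorithm>
#include <cstdio>
#include <vector>

struct Candidate {
  int Offset; // byte offset from the shared base address
  int Width;  // access width in bytes
};

// Merge runs of contiguous candidates into wider accesses, capped at MaxWidth.
static void mergeRun(std::vector<Candidate> &List, int MaxWidth) {
  // Visiting candidates in offset order (rather than program order) means a
  // candidate's only possible merge partner is its neighbor in the list.
  std::sort(List.begin(), List.end(),
            [](const Candidate &A, const Candidate &B) {
              return A.Offset < B.Offset;
            });

  for (size_t I = 0; I < List.size();) {
    size_t J = I + 1;
    int End = List[I].Offset + List[I].Width;
    // Cheap adjacency check: one comparison decides mergeability, so a
    // non-mergeable candidate can be dropped without further work.
    while (J < List.size() && List[J].Offset == End &&
           End + List[J].Width - List[I].Offset <= MaxWidth) {
      End += List[J].Width;
      ++J;
    }
    std::printf("merged access: offset %d, width %d (%zu candidates)\n",
                List[I].Offset, End - List[I].Offset, J - I);
    I = J;
  }
}

int main() {
  // The store sequence from the commit message, in program order.
  std::vector<Candidate> Stores = {{4, 4},  {8, 4},  {12, 4},
                                   {16, 4}, {20, 4}, {0, 4}};
  mergeRun(Stores, 16); // 16 bytes = four dwords, as in store4
  return 0;
}

Running this on the offsets from the example above (4, 8, 12, 16, 20, 0) produces
one 16-byte access at offset 0 and one 8-byte access at offset 16, mirroring the
store4/store2 result shown earlier.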

Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin

Reviewed By: arsenm, nhaehnle

Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D65966

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
    llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
    llvm/test/CodeGen/AMDGPU/merge-load-store.mir
    llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index d2b1abc8a9fb..364196b1d6e1 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -126,10 +126,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
     bool SLC;
     bool DLC;
     bool UseST64;
-    SmallVector<MachineInstr *, 8> InstsToMove;
     int AddrIdx[5];
     const MachineOperand *AddrReg[5];
     unsigned NumAddresses;
+    unsigned Order;
 
     bool hasSameBaseAddress(const MachineInstr &MI) {
       for (unsigned i = 0; i < NumAddresses; i++) {
@@ -210,7 +210,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
                                   const SIInstrInfo &TII,
                                   const CombineInfo &Paired);
   static bool offsetsCanBeCombined(CombineInfo &CI, const MCSubtargetInfo &STI,
-                                   CombineInfo &Paired);
+                                   CombineInfo &Paired, bool Modify = false);
   static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI,
                         const CombineInfo &Paired);
   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
@@ -219,21 +219,38 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
 
-  bool findMatchingInst(CombineInfo &CI, CombineInfo &Paired);
+  bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
+                            SmallVectorImpl<MachineInstr *> &InstsToMove);
 
   unsigned read2Opcode(unsigned EltSize) const;
   unsigned read2ST64Opcode(unsigned EltSize) const;
-  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired);
+  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
+                                             CombineInfo &Paired,
+                  const SmallVectorImpl<MachineInstr *> &InstsToMove);
 
   unsigned write2Opcode(unsigned EltSize) const;
   unsigned write2ST64Opcode(unsigned EltSize) const;
-  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired);
-  MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI, CombineInfo &Paired);
-  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired);
-  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired);
-  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired);
-  MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired);
-  MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired);
+  MachineBasicBlock::iterator
+  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
+                  const SmallVectorImpl<MachineInstr *> &InstsToMove);
+  MachineBasicBlock::iterator
+  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
+                 const SmallVectorImpl<MachineInstr *> &InstsToMove);
+  MachineBasicBlock::iterator
+  mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
+                          const SmallVectorImpl<MachineInstr *> &InstsToMove);
+  MachineBasicBlock::iterator
+  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
+                      const SmallVectorImpl<MachineInstr *> &InstsToMove);
+  MachineBasicBlock::iterator
+  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
+                       const SmallVectorImpl<MachineInstr *> &InstsToMove);
+  MachineBasicBlock::iterator
+  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
+                       const SmallVectorImpl<MachineInstr *> &InstsToMove);
+  MachineBasicBlock::iterator
+  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
+                        const SmallVectorImpl<MachineInstr *> &InstsToMove);
 
   void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
                            int32_t NewOffset) const;
@@ -259,8 +276,6 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
   }
 
-  void removeCombinedInst(std::list<CombineInfo> &MergeList,
-                                         const MachineInstr &MI);
   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                      bool &OptimizeListAgain);
   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
@@ -547,8 +562,6 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
     AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
     AddrReg[i] = &I->getOperand(AddrIdx[i]);
   }
-
-  InstsToMove.clear();
 }
 
 } // end anonymous namespace.
@@ -720,7 +733,8 @@ static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
 
 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                 const MCSubtargetInfo &STI,
-                                                CombineInfo &Paired) {
+                                                CombineInfo &Paired,
+                                                bool Modify) {
   assert(CI.InstClass != MIMG);
 
   // XXX - Would the same offset be OK? Is there any reason this would happen or
@@ -761,7 +775,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
   CI.UseST64 = false;
   CI.BaseOff = 0;
 
-  // Handle SMEM and VMEM instructions.
+  // Handle DS instructions.
   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
     return (EltOffset0 + CI.Width == EltOffset1 ||
             EltOffset1 + Paired.Width == EltOffset0) &&
@@ -769,20 +783,25 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
            (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
   }
 
+  // Handle SMEM and VMEM instructions.
   // If the offset in elements doesn't fit in 8-bits, we might be able to use
   // the stride 64 versions.
   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
-    CI.Offset = EltOffset0 / 64;
-    Paired.Offset = EltOffset1 / 64;
-    CI.UseST64 = true;
+    if (Modify) {
+      CI.Offset = EltOffset0 / 64;
+      Paired.Offset = EltOffset1 / 64;
+      CI.UseST64 = true;
+    }
     return true;
   }
 
   // Check if the new offsets fit in the reduced 8-bit range.
   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
-    CI.Offset = EltOffset0;
-    Paired.Offset = EltOffset1;
+    if (Modify) {
+      CI.Offset = EltOffset0;
+      Paired.Offset = EltOffset1;
+    }
     return true;
   }
 
@@ -791,15 +810,19 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
   CI.BaseOff = std::min(CI.Offset, Paired.Offset);
 
   if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
-    CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
-    Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
-    CI.UseST64 = true;
+    if (Modify) {
+      CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
+      Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
+      CI.UseST64 = true;
+    }
     return true;
   }
 
   if (isUInt<8>(OffsetDiff)) {
-    CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
-    Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
+    if (Modify) {
+      CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
+      Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
+    }
     return true;
   }
 
@@ -824,11 +847,19 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
   }
 }
 
-bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
-                                            CombineInfo &Paired) {
-  MachineBasicBlock *MBB = CI.I->getParent();
-  MachineBasicBlock::iterator E = MBB->end();
-  MachineBasicBlock::iterator MBBI = CI.I;
+/// This function assumes that CI comes before Paired in a basic block.
+bool SILoadStoreOptimizer::checkAndPrepareMerge(
+    CombineInfo &CI, CombineInfo &Paired,
+    SmallVectorImpl<MachineInstr *> &InstsToMove) {
+
+  // Check both offsets (or masks for MIMG) can be combined and fit in the
+  // reduced range.
+  if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
+    return false;
+
+  if (CI.InstClass != MIMG &&
+      (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STI, Paired)))
+    return false;
 
   const unsigned Opc = CI.I->getOpcode();
   const InstClassEnum InstClass = getInstClass(Opc, *TII);
@@ -844,12 +875,12 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
   if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
     return false;
 
-  ++MBBI;
-
   DenseSet<unsigned> RegDefsToMove;
   DenseSet<unsigned> PhysRegUsesToMove;
   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
 
+  MachineBasicBlock::iterator E = std::next(Paired.I);
+  MachineBasicBlock::iterator MBBI = std::next(CI.I);
   for (; MBBI != E; ++MBBI) {
 
     if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
@@ -868,11 +899,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
 
       if (MBBI->mayLoadOrStore() &&
           (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
-           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
+           !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
         // We fail condition #1, but we may still be able to satisfy condition
         // #2.  Add this instruction to the move list and then we will check
         // if condition #2 holds once we have selected the matching instruction.
-        CI.InstsToMove.push_back(&*MBBI);
+        InstsToMove.push_back(&*MBBI);
         addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
         continue;
       }
@@ -881,7 +912,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
       // to the location of the matched instruction any uses of I will need to
       // be moved down as well.
       addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
-                            CI.InstsToMove);
+                            InstsToMove);
       continue;
     }
 
@@ -901,26 +932,24 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
     // merging of the two writes.
     if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
-                              CI.InstsToMove))
+                              InstsToMove))
       continue;
 
-    bool Match = CI.hasSameBaseAddress(*MBBI);
-
-    if (Match) {
-      Paired.setMI(MBBI, *TII, *STM);
-
-      // Check both offsets (or masks for MIMG) can be combined and fit in the
-      // reduced range.
-      bool canBeCombined =
-          CI.InstClass == MIMG
-              ? dmasksCanBeCombined(CI, *TII, Paired)
-              : widthsFit(*STM, CI, Paired) && offsetsCanBeCombined(CI, *STI, Paired);
-
-      // We also need to go through the list of instructions that we plan to
+    if (&*MBBI == &*Paired.I) {
+      // We need to go through the list of instructions that we plan to
       // move and make sure they are all safe to move down past the merged
       // instruction.
-      if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
+      if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
+
+        // Call offsetsCanBeCombined with modify = true so that the offsets are
+        // correct for the new instruction.  This should return true, because
+        // this function should only be called on CombineInfo objects that
+        // have already been confirmed to be mergeable.
+        if (CI.InstClass != MIMG)
+          offsetsCanBeCombined(CI, *STI, Paired, true);
         return true;
+      }
+      return false;
     }
 
     // We've found a load/store that we couldn't merge for some reason.
@@ -929,7 +958,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
     // down past this instruction.
     // check if we can move I across MBBI and if we can move all I's users
     if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
-        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
+        !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
       break;
   }
   return false;
@@ -950,7 +979,8 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
 }
 
 MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired) {
+SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
+    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
 
   // Be careful, since the addresses could be subregisters themselves in weird
@@ -1023,7 +1053,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired) {
                             .add(*Dest1)
                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
-  moveInstsAfter(Copy1, CI.InstsToMove);
+  moveInstsAfter(Copy1, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1049,7 +1079,8 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
 }
 
 MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired) {
+SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
+                                      const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
 
   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -1106,7 +1137,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired) {
           .addImm(0)                                 // gds
           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
 
-  moveInstsAfter(Write2, CI.InstsToMove);
+  moveInstsAfter(Write2, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1116,7 +1147,8 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired) {
 }
 
 MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired) {
+SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
+                           const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
   const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1161,15 +1193,16 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired) {
                             .add(*Dest1)
                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
-  moveInstsAfter(Copy1, CI.InstsToMove);
+  moveInstsAfter(Copy1, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
   return New;
 }
 
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
+    CombineInfo &CI, CombineInfo &Paired,
+    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
   const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1211,15 +1244,16 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Pair
                             .add(*Dest1)
                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
-  moveInstsAfter(Copy1, CI.InstsToMove);
+  moveInstsAfter(Copy1, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
   return New;
 }
 
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
+    CombineInfo &CI, CombineInfo &Paired,
+    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
 
@@ -1273,15 +1307,16 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired)
                             .add(*Dest1)
                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
-  moveInstsAfter(Copy1, CI.InstsToMove);
+  moveInstsAfter(Copy1, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
   return New;
 }
 
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
+    CombineInfo &CI, CombineInfo &Paired,
+    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
 
@@ -1340,15 +1375,16 @@ SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired)
                             .add(*Dest1)
                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
-  moveInstsAfter(Copy1, CI.InstsToMove);
+  moveInstsAfter(Copy1, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
   return New;
 }
 
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
+    CombineInfo &CI, CombineInfo &Paired,
+    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
 
@@ -1403,7 +1439,7 @@ SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired
           .addMemOperand(
               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
-  moveInstsAfter(MIB, CI.InstsToMove);
+  moveInstsAfter(MIB, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1509,8 +1545,9 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
   }
 }
 
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
+    CombineInfo &CI, CombineInfo &Paired,
+    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
 
@@ -1561,7 +1598,7 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired)
         .addImm(0)            // swz
         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
-  moveInstsAfter(MIB, CI.InstsToMove);
+  moveInstsAfter(MIB, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1904,6 +1941,7 @@ bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
   SmallPtrSet<MachineInstr *, 4> AnchorList;
 
   // Sort potential mergeable instructions into lists.  One list per base address.
+  unsigned Order = 0;
   for (MachineInstr &MI : MBB.instrs()) {
     // We run this before checking if an address is mergeable, because it can produce
     // better code even if the instructions aren't mergeable.
@@ -1920,12 +1958,43 @@ bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
 
     CombineInfo CI;
     CI.setMI(MI, *TII, *STM);
+    CI.Order = Order++;
 
     if (!CI.hasMergeableAddress(*MRI))
       continue;
 
     addInstToMergeableList(CI, MergeableInsts);
   }
+
+  // At this point we have lists of Mergeable instructions.
+  //
+  // Part 2: Sort lists by offset and then for each CombineInfo object in the
+  // list try to find an instruction that can be merged with I.  If an instruction
+  // is found, it is stored in the Paired field.  If no instructions are found, then
+  // the CombineInfo object is deleted from the list.
+  
+  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
+                                                   E = MergeableInsts.end(); I != E;) {
+
+    std::list<CombineInfo> &MergeList = *I;
+    if (MergeList.size() <= 1) {
+      // This means we have found only one instruction with a given address
+      // that can be merged, and we need at least 2 instructions to do a merge,
+      // so this list can be discarded.
+      I = MergeableInsts.erase(I);
+      continue;
+    }
+
+    // Sort the lists by offsets, this way mergeable instructions will be
+    // adjacent to each other in the list, which will make it easier to find
+    // matches.
+    MergeList.sort(
+        [] (const CombineInfo &A, CombineInfo &B) {
+          return A.Offset < B.Offset;
+        });
+    ++I;
+  }
+
   return Modified;
 }
 
@@ -1936,117 +2005,124 @@ bool SILoadStoreOptimizer::optimizeBlock(
                        std::list<std::list<CombineInfo> > &MergeableInsts) {
   bool Modified = false;
 
-  for (std::list<CombineInfo> &MergeList : MergeableInsts) {
-    if (MergeList.size() < 2)
-      continue;
+  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
+                                                   E = MergeableInsts.end(); I != E;) {
+    std::list<CombineInfo> &MergeList = *I;
 
     bool OptimizeListAgain = false;
     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
-      // We weren't able to make any changes, so clear the list so we don't
+      // We weren't able to make any changes, so delete the list so we don't
       // process the same instructions the next time we try to optimize this
       // block.
-      MergeList.clear();
+      I = MergeableInsts.erase(I);
       continue;
     }
 
-    // We made changes, but also determined that there were no more optimization
-    // opportunities, so we don't need to reprocess the list
-    if (!OptimizeListAgain)
-      MergeList.clear();
-
-    OptimizeAgain |= OptimizeListAgain;
     Modified = true;
-  }
-  return Modified;
-}
-
-void
-SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
-                                         const MachineInstr &MI) {
 
-  for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
-    if (&*CI->I == &MI) {
-      MergeList.erase(CI);
-      return;
+    // We made changes, but also determined that there were no more optimization
+    // opportunities, so we don't need to reprocess the list
+    if (!OptimizeListAgain) {
+      I = MergeableInsts.erase(I);
+      continue;
     }
+    OptimizeAgain = true;
   }
+  return Modified;
 }
 
 bool
 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                           std::list<CombineInfo> &MergeList,
                                           bool &OptimizeListAgain) {
+  if (MergeList.empty())
+    return false;
+
   bool Modified = false;
-  for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
-    CombineInfo &CI = *I;
-    CombineInfo Paired;
 
-    if (CI.InstClass == UNKNOWN)
-      continue;
+  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
+       Next = std::next(I)) {
+
+    auto First = I;
+    auto Second = Next;
+
+    if ((*First).Order > (*Second).Order)
+      std::swap(First, Second);
+    CombineInfo &CI = *First;
+    CombineInfo &Paired = *Second;
 
-    if (!findMatchingInst(CI, Paired))
-      goto done;
+    SmallVector<MachineInstr *, 8> InstsToMove;
+    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
+      ++I;
+      continue;
+    }
 
     Modified = true;
-    removeCombinedInst(MergeList, *Paired.I);
 
     switch (CI.InstClass) {
     default:
       llvm_unreachable("unknown InstClass");
       break;
     case DS_READ: {
-      MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeRead2Pair(CI, Paired, InstsToMove);
       CI.setMI(NewMI, *TII, *STM);
       break;
     }
     case DS_WRITE: {
-      MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeWrite2Pair(CI, Paired, InstsToMove);
       CI.setMI(NewMI, *TII, *STM);
       break;
     }
     case S_BUFFER_LOAD_IMM: {
-      MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
       CI.setMI(NewMI, *TII, *STM);
       OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
       break;
     }
     case BUFFER_LOAD: {
-      MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeBufferLoadPair(CI, Paired, InstsToMove);
       CI.setMI(NewMI, *TII, *STM);
       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
       break;
     }
     case BUFFER_STORE: {
-      MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeBufferStorePair(CI, Paired, InstsToMove);
       CI.setMI(NewMI, *TII, *STM);
       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
       break;
     }
     case MIMG: {
-      MachineBasicBlock::iterator NewMI = mergeImagePair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeImagePair(CI, Paired, InstsToMove);
       CI.setMI(NewMI, *TII, *STM);
       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
       break;
     }
     case TBUFFER_LOAD: {
-      MachineBasicBlock::iterator NewMI = mergeTBufferLoadPair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeTBufferLoadPair(CI, Paired, InstsToMove);
       CI.setMI(NewMI, *TII, *STM);
       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
       break;
     }
     case TBUFFER_STORE: {
-      MachineBasicBlock::iterator NewMI = mergeTBufferStorePair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeTBufferStorePair(CI, Paired, InstsToMove);
       CI.setMI(NewMI, *TII, *STM);
       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
       break;
     }
     }
+    CI.Order = Paired.Order;
+    if (I == Second)
+      I = Next;
 
-done:
-    // Clear the InstsToMove after we have finished searching so we don't have
-    // stale values left over if we search for this CI again in another pass
-    // over the block.
-    CI.InstsToMove.clear();
+    MergeList.erase(Second);
   }
 
   return Modified;

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index 9668743cf128..15fa757f0416 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -6,9 +6,9 @@
 ; offset0 is larger than offset1
 
 ; SI-LABEL: {{^}}offset_order:
-; SI-DAG: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:4{{$}}
+; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:14{{$}}
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
-; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1024
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12
 define amdgpu_kernel void @offset_order(float addrspace(1)* %out) {
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/merge-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store.mir
index 14fd6201a14d..dcc58ba956e3 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-load-store.mir
@@ -66,6 +66,7 @@
   attributes #0 = { convergent nounwind }
 
   define amdgpu_kernel void @merge_mmos(i32 addrspace(1)* %ptr_addr1) { ret void }
+  define amdgpu_kernel void @reorder_offsets(i32 addrspace(1)* %reorder_addr1) { ret void }
 
 ...
 ---
@@ -194,3 +195,26 @@ body: |
     S_ENDPGM 0
 
 ...
+---
+# CHECK-LABEL: reorder_offsets
+# CHECK-DAG: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.reorder_addr1 + 16, align 4, addrspace 1)
+# CHECK-DAG: BUFFER_STORE_DWORDX4_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into %ir.reorder_addr1, align 4, addrspace 1)
+
+name: reorder_offsets
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 4)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 8)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 12, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 12)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 16)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 20)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1)
+    S_ENDPGM 0
+
+
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
index 41b83643d91b..1f10a047f7d9 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
@@ -133,17 +133,17 @@ body:             |
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
 # GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1
-# GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3
+# GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
+# GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1
 # GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
 
 name: gfx9_tbuffer_load_float_32
 body:             |
@@ -170,17 +170,17 @@ body:             |
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
 # GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1
-# GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3
+# GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
+# GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1
 # GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 93, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
 
 name: gfx9_tbuffer_load_sint_32
 body:             |
@@ -207,17 +207,17 @@ body:             |
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
 # GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1
-# GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3
+# GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
+# GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1
 # GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
 
 name: gfx9_tbuffer_load_uint_32
 body:             |
@@ -492,11 +492,11 @@ body:             |
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
 # GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3
-# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
+# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
+# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2
+# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
 # GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx9_tbuffer_store_float32
 body:             |
@@ -546,11 +546,11 @@ body:             |
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
 # GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 91, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3
-# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
+# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
+# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2
+# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
 # GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 93, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx9_tbuffer_store_sint32
 body:             |
@@ -600,11 +600,11 @@ body:             |
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
 # GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3
-# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
+# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
+# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2
+# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
 # GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx9_tbuffer_store_uint32
 body:             |
@@ -911,17 +911,17 @@ body:             |
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
 # GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1
-# GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3
+# GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
+# GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1
 # GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
 
 name: gfx10_tbuffer_load_float_32
 body:             |
@@ -948,17 +948,17 @@ body:             |
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
 # GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1
-# GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3
+# GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
+# GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1
 # GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 73, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
 
 name: gfx10_tbuffer_load_sint_32
 body:             |
@@ -985,17 +985,17 @@ body:             |
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
 # GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1
-# GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3
+# GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
+# GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1
 # GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
 
 name: gfx10_tbuffer_load_uint_32
 body:             |
@@ -1271,11 +1271,11 @@ body:             |
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
 # GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3
-# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
+# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
+# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2
+# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
 # GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx10_tbuffer_store_float32
 body:             |
@@ -1325,11 +1325,11 @@ body:             |
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
 # GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 63, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3
-# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
+# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
+# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2
+# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
 # GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 73, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx10_tbuffer_store_sint32
 body:             |
@@ -1379,11 +1379,11 @@ body:             |
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
 # GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 62, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3
-# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
+# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
+# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2
+# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
 # GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx10_tbuffer_store_uint32
 body:             |


        

